diff --git a/tests/conftest.py b/tests/conftest.py index 67885b93285c5..79846bb024dba 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -11,14 +11,15 @@ from transformers import (AutoModelForCausalLM, AutoModelForVision2Seq, AutoProcessor, AutoTokenizer, BatchEncoding) -from vllm import LLM, SamplingParams from vllm.config import TokenizerPoolConfig, VisionLanguageConfig from vllm.distributed import (destroy_distributed_environment, destroy_model_parallel) +from vllm.entrypoints.llm import LLM from vllm.inputs import TextPrompt from vllm.logger import init_logger from vllm.multimodal import MultiModalData from vllm.multimodal.image import ImageFeatureData, ImagePixelData +from vllm.sampling_params import SamplingParams from vllm.sequence import SampleLogprobs from vllm.utils import cuda_device_count_stateless, is_cpu diff --git a/tests/entrypoints/conftest.py b/tests/entrypoints/conftest.py new file mode 100644 index 0000000000000..948778b2c22ed --- /dev/null +++ b/tests/entrypoints/conftest.py @@ -0,0 +1,72 @@ +import pytest + + +@pytest.fixture +def sample_regex(): + return (r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}" + r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)") + + +@pytest.fixture +def sample_json_schema(): + return { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "age": { + "type": "integer" + }, + "skills": { + "type": "array", + "items": { + "type": "string", + "maxLength": 10 + }, + "minItems": 3 + }, + "work_history": { + "type": "array", + "items": { + "type": "object", + "properties": { + "company": { + "type": "string" + }, + "duration": { + "type": "number" + }, + "position": { + "type": "string" + } + }, + "required": ["company", "position"] + } + } + }, + "required": ["name", "age", "skills", "work_history"] + } + + +@pytest.fixture +def sample_guided_choice(): + return [ + "Python", "Java", "JavaScript", "C++", "C#", "PHP", "TypeScript", + "Ruby", "Swift", "Kotlin" + ] + + +@pytest.fixture +def sample_sql_statements(): + return (""" +start: select_statement + +select_statement: "SELECT" column "from" table "where" condition + +column: "col_1" | "col_2" +table: "table_1" | "table_2" +condition: column "=" number + +number: "1" | "2" +""") diff --git a/tests/entrypoints/test_guided_processors.py b/tests/entrypoints/test_guided_processors.py index fb32a9d155bc0..a8792d7ae9737 100644 --- a/tests/entrypoints/test_guided_processors.py +++ b/tests/entrypoints/test_guided_processors.py @@ -4,67 +4,22 @@ import torch from transformers import AutoTokenizer -from vllm.entrypoints.openai.protocol import CompletionRequest from vllm.model_executor.guided_decoding import ( - get_guided_decoding_logits_processor) + GuidedDecodingFields, get_guided_decoding_logits_processor) from vllm.model_executor.guided_decoding.outlines_logits_processors import ( JSONLogitsProcessor, RegexLogitsProcessor) -TEST_SCHEMA = { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "age": { - "type": "integer" - }, - "skills": { - "type": "array", - "items": { - "type": "string", - "maxLength": 10 - }, - "minItems": 3 - }, - "work history": { - "type": "array", - "items": { - "type": "object", - "properties": { - "company": { - "type": "string" - }, - "duration": { - "type": "string" - }, - "position": { - "type": "string" - } - }, - "required": ["company", "position"] - } - } - }, - "required": ["name", "age", "skills", "work history"] -} -TEST_REGEX = (r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}" - r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)") - -pytestmark = 
pytest.mark.openai - - -def test_guided_logits_processors(): +def test_guided_logits_processors(sample_regex, sample_json_schema): """Basic unit test for RegexLogitsProcessor and JSONLogitsProcessor.""" tokenizer = AutoTokenizer.from_pretrained('HuggingFaceH4/zephyr-7b-beta') - regex_LP = RegexLogitsProcessor(TEST_REGEX, tokenizer) - json_LP = JSONLogitsProcessor(TEST_SCHEMA, + regex_LP = RegexLogitsProcessor(sample_regex, tokenizer) + json_LP = JSONLogitsProcessor(sample_json_schema, tokenizer, whitespace_pattern=None) token_ids = tokenizer.encode( - f"Give an example IPv4 address with this regex: {TEST_REGEX}") + f"Give an example IPv4 address with this regex: {sample_regex}") tensor = torch.rand(32000) original_tensor = torch.clone(tensor) regex_LP(token_ids, tensor) @@ -72,7 +27,8 @@ def test_guided_logits_processors(): assert not torch.allclose(tensor, original_tensor) token_ids = tokenizer.encode( - f"Give an employee profile that fits this schema: {TEST_SCHEMA}") + f"Give an employee profile that fits this schema: {sample_json_schema}" + ) tensor = torch.rand(32000) original_tensor = torch.clone(tensor) json_LP(token_ids, tensor) @@ -82,15 +38,15 @@ def test_guided_logits_processors(): @pytest.mark.asyncio @pytest.mark.parametrize("backend", ["outlines", "lm-format-enforcer"]) -async def test_guided_logits_processor_black_box(backend: str): +async def test_guided_logits_processor_black_box(sample_regex, + sample_json_schema, + backend: str): tokenizer = AutoTokenizer.from_pretrained('HuggingFaceH4/zephyr-7b-beta') token_ids = tokenizer.encode( - f"Give an example IPv4 address with this regex: {TEST_REGEX}") - regex_request = CompletionRequest(model='test', - prompt=token_ids, - guided_regex=TEST_REGEX) - regex_lp = await get_guided_decoding_logits_processor( - backend, regex_request, tokenizer) + f"Give an example IPv4 address with this regex: {sample_regex}") + regex_lp = get_guided_decoding_logits_processor( + GuidedDecodingFields(guided_regex=sample_regex, + guided_decoding_backend=backend), tokenizer) assert regex_lp is not None tensor = torch.rand(32000) original_tensor = torch.clone(tensor) @@ -99,12 +55,11 @@ async def test_guided_logits_processor_black_box(backend: str): assert not torch.allclose(tensor, original_tensor) token_ids = tokenizer.encode( - f"Give an employee profile that fits this schema: {TEST_SCHEMA}") - json_request = CompletionRequest(model='test', - prompt=token_ids, - guided_json=TEST_SCHEMA) - json_lp = await get_guided_decoding_logits_processor( - backend, json_request, tokenizer) + f"Give an employee profile that fits this schema: {sample_json_schema}" + ) + json_lp = get_guided_decoding_logits_processor( + GuidedDecodingFields(guided_json=sample_json_schema, + guided_decoding_backend=backend), tokenizer) assert json_lp is not None tensor = torch.rand(32000) original_tensor = torch.clone(tensor) diff --git a/tests/entrypoints/test_llm_generate.py b/tests/entrypoints/test_llm_generate.py index a00fff91a310e..88477a7cc9832 100644 --- a/tests/entrypoints/test_llm_generate.py +++ b/tests/entrypoints/test_llm_generate.py @@ -1,13 +1,18 @@ +import json +import re import weakref from typing import List +import jsonschema import pytest -from vllm import LLM, RequestOutput, SamplingParams +from vllm.entrypoints.llm import LLM +from vllm.outputs import RequestOutput +from vllm.sampling_params import SamplingParams from ..conftest import cleanup -MODEL_NAME = "facebook/opt-125m" +MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" PROMPTS = [ "Hello, my name is", @@ -15,7 
+20,6 @@ "The capital of France is", "The future of AI is", ] - TOKEN_IDS = [ [0], [0, 1], @@ -30,11 +34,7 @@ def llm(): # pytest caches the fixture so we use weakref.proxy to # enable garbage collection - llm = LLM(model=MODEL_NAME, - max_num_batched_tokens=4096, - tensor_parallel_size=1, - gpu_memory_utilization=0.10, - enforce_eager=True) + llm = LLM(model=MODEL_NAME, max_model_len=1024) with llm.deprecate_legacy_api(): yield weakref.proxy(llm) @@ -119,6 +119,13 @@ def test_v1_v2_api_consistency_multi_prompt_tokens(llm: LLM): @pytest.mark.skip_global_cleanup def test_multiple_sampling_params(llm: LLM): + prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", + ] + sampling_params = [ SamplingParams(temperature=0.01, top_p=0.95), SamplingParams(temperature=0.3, top_p=0.95), @@ -140,5 +147,119 @@ def test_multiple_sampling_params(llm: LLM): assert len(PROMPTS) == len(outputs) # sampling_params is None, default params should be applied - outputs = llm.generate(PROMPTS, sampling_params=None) - assert len(PROMPTS) == len(outputs) + outputs = llm.generate(prompts, sampling_params=None) + assert len(prompts) == len(outputs) + + +@pytest.mark.skip_global_cleanup +def test_guided_regex(sample_regex, llm): + sampling_params = SamplingParams( + temperature=0.8, + top_p=0.95, + ) + outputs = llm.generate(prompts=[ + f"Give an example IPv4 address with this regex: {sample_regex}" + ] * 2, + sampling_params=sampling_params, + use_tqdm=True, + guided_options=dict(guided_regex=sample_regex)) + + assert outputs is not None + for output in outputs: + assert output is not None + assert isinstance(output, RequestOutput) + prompt = output.prompt + generated_text = output.outputs[0].text + print(generated_text) + assert generated_text is not None + assert re.fullmatch(sample_regex, generated_text) is not None + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + + +@pytest.mark.skip_global_cleanup +def test_guided_json_completion(sample_json_schema, llm): + sampling_params = SamplingParams( + temperature=1.0, + max_tokens=1000, + ) + outputs = llm.generate(prompts=[ + f"Give an example JSON for an employee profile " + f"that fits this schema: {sample_json_schema}" + ] * 2, + sampling_params=sampling_params, + use_tqdm=True, + guided_options=dict(guided_json=sample_json_schema)) + + assert outputs is not None + + for output in outputs: + assert output is not None + assert isinstance(output, RequestOutput) + prompt = output.prompt + + generated_text = output.outputs[0].text + assert generated_text is not None + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + output_json = json.loads(generated_text) + jsonschema.validate(instance=output_json, schema=sample_json_schema) + + +@pytest.mark.skip_global_cleanup +def test_guided_choice_completion(sample_guided_choice, llm): + sampling_params = SamplingParams( + temperature=0.8, + top_p=0.95, + ) + outputs = llm.generate( + prompts="The best language for type-safe systems programming is ", + sampling_params=sampling_params, + use_tqdm=True, + guided_options=dict(guided_choice=sample_guided_choice)) + + assert outputs is not None + for output in outputs: + assert output is not None + assert isinstance(output, RequestOutput) + prompt = output.prompt + generated_text = output.outputs[0].text + print(generated_text) + assert generated_text is not None + assert generated_text in sample_guided_choice + print(f"Prompt: {prompt!r}, Generated text: 
{generated_text!r}") + + +@pytest.mark.skip_global_cleanup +def test_guided_grammar(sample_sql_statements, llm): + + sampling_params = SamplingParams( + temperature=0.8, + top_p=0.95, + ) + outputs = llm.generate( + prompts=("Generate a sql state that select col_1 from " + "table_1 where it is equals to 1"), + sampling_params=sampling_params, + use_tqdm=True, + guided_options=dict(guided_grammar=sample_sql_statements)) + + assert outputs is not None + for output in outputs: + assert output is not None + assert isinstance(output, RequestOutput) + prompt = output.prompt + + generated_text = output.outputs[0].text + assert generated_text is not None + + # use Lark to parse the output, and make sure it's a valid parse tree + from lark import Lark + parser = Lark(sample_sql_statements) + parser.parse(generated_text) + + # remove spaces for comparison b/c we removed them in the grammar + ground_truth = "SELECT col_1 from table_1 where col_1 = 1".replace( + " ", "") + + assert generated_text.strip() == ground_truth + + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py index c22a675ff1230..698fc09accdc7 100644 --- a/tests/entrypoints/test_openai_server.py +++ b/tests/entrypoints/test_openai_server.py @@ -24,53 +24,6 @@ # generation quality here LORA_NAME = "typeof/zephyr-7b-beta-lora" -TEST_SCHEMA = { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "age": { - "type": "integer" - }, - "skills": { - "type": "array", - "items": { - "type": "string", - "maxLength": 10 - }, - "minItems": 3 - }, - "work history": { - "type": "array", - "items": { - "type": "object", - "properties": { - "company": { - "type": "string" - }, - "duration": { - "type": "string" - }, - "position": { - "type": "string" - } - }, - "required": ["company", "position"] - } - } - }, - "required": ["name", "age", "skills", "work history"] -} - -TEST_REGEX = (r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}" - r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)") - -TEST_CHOICE = [ - "Python", "Java", "JavaScript", "C++", "C#", "PHP", "TypeScript", "Ruby", - "Swift", "Kotlin" -] - pytestmark = pytest.mark.openai @@ -115,7 +68,7 @@ def server(zephyr_lora_files, ray_ctx): def client(server): return server.get_async_client() - +@pytest.mark.asyncio async def test_check_models(client: openai.AsyncOpenAI): models = await client.models.list() models = models.data @@ -127,1086 +80,1102 @@ async def test_check_models(client: openai.AsyncOpenAI): assert lora_models[1].id == "zephyr-lora2" -@pytest.mark.asyncio -@pytest.mark.parametrize( - # first test base model, then test loras - "model_name", - [MODEL_NAME, "zephyr-lora", "zephyr-lora2"], -) -async def test_single_completion(client: openai.AsyncOpenAI, model_name: str): - completion = await client.completions.create(model=model_name, - prompt="Hello, my name is", - max_tokens=5, - temperature=0.0) - - assert completion.id is not None - assert completion.choices is not None and len(completion.choices) == 1 - - choice = completion.choices[0] - assert len(choice.text) >= 5 - assert choice.finish_reason == "length" - assert completion.usage == openai.types.CompletionUsage( - completion_tokens=5, prompt_tokens=6, total_tokens=11) - - # test using token IDs - completion = await client.completions.create( - model=MODEL_NAME, - prompt=[0, 0, 0, 0, 0], - max_tokens=5, - temperature=0.0, - ) - assert len(completion.choices[0].text) >= 5 - - -@pytest.mark.asyncio -@pytest.mark.parametrize( - 
# first test base model, then test loras - "model_name", - [MODEL_NAME, "zephyr-lora", "zephyr-lora2"], -) -async def test_no_logprobs(client: openai.AsyncOpenAI, model_name: str): - # test using token IDs - completion = await client.completions.create( - model=MODEL_NAME, - prompt=[0, 0, 0, 0, 0], - max_tokens=5, - temperature=0.0, - logprobs=None, - ) - choice = completion.choices[0] - assert choice.logprobs is None - - -@pytest.mark.asyncio -@pytest.mark.parametrize( - # just test 1 lora hereafter - "model_name", - [MODEL_NAME, "zephyr-lora"], -) -async def test_zero_logprobs(client: openai.AsyncOpenAI, model_name: str): - # test using token IDs - completion = await client.completions.create( - model=MODEL_NAME, - prompt=[0, 0, 0, 0, 0], - max_tokens=5, - temperature=0.0, - logprobs=0, - ) - choice = completion.choices[0] - assert choice.logprobs is not None - assert choice.logprobs.token_logprobs is not None - assert choice.logprobs.top_logprobs is not None - assert len(choice.logprobs.top_logprobs[0]) == 1 - - -@pytest.mark.asyncio -@pytest.mark.parametrize( - "model_name", - [MODEL_NAME, "zephyr-lora"], -) -async def test_some_logprobs(client: openai.AsyncOpenAI, model_name: str): - # test using token IDs - completion = await client.completions.create( - model=MODEL_NAME, - prompt=[0, 0, 0, 0, 0], - max_tokens=5, - temperature=0.0, - logprobs=5, - ) - choice = completion.choices[0] - assert choice.logprobs is not None - assert choice.logprobs.token_logprobs is not None - assert choice.logprobs.top_logprobs is not None - assert 5 <= len(choice.logprobs.top_logprobs[0]) <= 6 - - -@pytest.mark.asyncio -@pytest.mark.parametrize( - "model_name", - [MODEL_NAME, "zephyr-lora"], -) -async def test_too_many_completion_logprobs(client: openai.AsyncOpenAI, - model_name: str): - - with pytest.raises( - (openai.BadRequestError, openai.APIError)): # test using token IDs - await client.completions.create( - model=MODEL_NAME, - prompt=[0, 0, 0, 0, 0], - max_tokens=5, - temperature=0.0, - # vLLM has higher default max_logprobs (20 instead of 5) to support - # both Completion API and Chat Completion API - logprobs=21, - ) - ... - with pytest.raises( - (openai.BadRequestError, openai.APIError)): # test using token IDs - stream = await client.completions.create( - model=MODEL_NAME, - prompt=[0, 0, 0, 0, 0], - max_tokens=5, - temperature=0.0, - # vLLM has higher default max_logprobs (20 instead of 5) to support - # both Completion API and Chat Completion API - logprobs=30, - stream=True, - ) - async for chunk in stream: - ... - - # the server should still work afterwards - completion = await client.completions.create( - model=model_name, - prompt=[0, 0, 0, 0, 0], - max_tokens=5, - temperature=0.0, - ) - assert len(completion.choices[0].text) >= 0 - - -@pytest.mark.asyncio -@pytest.mark.parametrize( - # first test base model, then test loras - "model_name", - [MODEL_NAME, "zephyr-lora", "zephyr-lora2"], -) -async def test_no_logprobs_chat(client: openai.AsyncOpenAI, model_name: str): - messages = [{ - "role": "system", - "content": "you are a helpful assistant" - }, { - "role": "user", - "content": "what is 1+1?" 
- }] - - chat_completion = await client.chat.completions.create(model=model_name, - messages=messages, - max_tokens=5, - temperature=0.0, - logprobs=False) - - choice = chat_completion.choices[0] - assert choice.logprobs is None - - -@pytest.mark.asyncio -@pytest.mark.parametrize( - # just test 1 lora hereafter - "model_name", - [MODEL_NAME, "zephyr-lora"], -) -async def test_zero_logprobs_chat(client: openai.AsyncOpenAI, model_name: str): - messages = [{ - "role": "system", - "content": "you are a helpful assistant" - }, { - "role": "user", - "content": "what is 1+1?" - }] - - chat_completion = await client.chat.completions.create(model=model_name, - messages=messages, - max_tokens=5, - temperature=0.0, - logprobs=True, - top_logprobs=0) - - choice = chat_completion.choices[0] - assert choice.logprobs is not None - assert choice.logprobs.content is not None - assert len(choice.logprobs.content[0].top_logprobs) == 0 - - -@pytest.mark.asyncio -@pytest.mark.parametrize( - "model_name", - [MODEL_NAME, "zephyr-lora"], -) -async def test_some_logprobs_chat(client: openai.AsyncOpenAI, model_name: str): - messages = [{ - "role": "system", - "content": "you are a helpful assistant" - }, { - "role": "user", - "content": "what is 1+1?" - }] - - chat_completion = await client.chat.completions.create(model=model_name, - messages=messages, - max_tokens=5, - temperature=0.0, - logprobs=True, - top_logprobs=5) - - choice = chat_completion.choices[0] - assert choice.logprobs is not None - assert choice.logprobs.content is not None - assert len(choice.logprobs.content[0].top_logprobs) == 5 - - -@pytest.mark.asyncio -@pytest.mark.parametrize( - "model_name", - [MODEL_NAME, "zephyr-lora"], -) -async def test_too_many_chat_logprobs(client: openai.AsyncOpenAI, - model_name: str): - messages = [{ - "role": "system", - "content": "you are a helpful assistant" - }, { - "role": "user", - "content": "what is 1+1?" - }] - - # Default max_logprobs is 20, so this should raise an error - with pytest.raises((openai.BadRequestError, openai.APIError)): - stream = await client.chat.completions.create(model=model_name, - messages=messages, - max_tokens=10, - logprobs=True, - top_logprobs=21, - stream=True) - async for chunk in stream: - ... - - with pytest.raises(openai.BadRequestError): - await client.chat.completions.create(model=model_name, - messages=messages, - max_tokens=10, - logprobs=True, - top_logprobs=30, - stream=False) - - # the server should still work afterwards - chat_completion = await client.chat.completions.create(model=model_name, - messages=messages, - max_tokens=10, - stream=False) - message = chat_completion.choices[0].message - assert message.content is not None and len(message.content) >= 0 - - -@pytest.mark.asyncio -@pytest.mark.parametrize( - "model_name", - [MODEL_NAME, "zephyr-lora"], -) -async def test_single_chat_session(client: openai.AsyncOpenAI, - model_name: str): - messages = [{ - "role": "system", - "content": "you are a helpful assistant" - }, { - "role": "user", - "content": "what is 1+1?" 
- }] - - # test single completion - chat_completion = await client.chat.completions.create(model=model_name, - messages=messages, - max_tokens=10, - logprobs=True, - top_logprobs=5) - assert chat_completion.id is not None - assert len(chat_completion.choices) == 1 - - choice = chat_completion.choices[0] - assert choice.finish_reason == "length" - assert chat_completion.usage == openai.types.CompletionUsage( - completion_tokens=10, prompt_tokens=37, total_tokens=47) - - message = choice.message - assert message.content is not None and len(message.content) >= 10 - assert message.role == "assistant" - messages.append({"role": "assistant", "content": message.content}) - - # test multi-turn dialogue - messages.append({"role": "user", "content": "express your result in json"}) - chat_completion = await client.chat.completions.create( - model=model_name, - messages=messages, - max_tokens=10, - ) - message = chat_completion.choices[0].message - assert message.content is not None and len(message.content) >= 0 - - -@pytest.mark.asyncio -@pytest.mark.parametrize( - "model_name", - [MODEL_NAME, "zephyr-lora"], -) -async def test_completion_streaming(client: openai.AsyncOpenAI, - model_name: str): - prompt = "What is an LLM?" - - single_completion = await client.completions.create( - model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - ) - single_output = single_completion.choices[0].text - stream = await client.completions.create(model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - stream=True) - chunks: List[str] = [] - finish_reason_count = 0 - async for chunk in stream: - chunks.append(chunk.choices[0].text) - if chunk.choices[0].finish_reason is not None: - finish_reason_count += 1 - # finish reason should only return in last block - assert finish_reason_count == 1 - assert chunk.choices[0].finish_reason == "length" - assert chunk.choices[0].text - assert "".join(chunks) == single_output - - -@pytest.mark.asyncio -@pytest.mark.parametrize( - # just test 1 lora hereafter - "model_name", - [MODEL_NAME, "zephyr-lora"], -) -async def test_chat_streaming(client: openai.AsyncOpenAI, model_name: str): - messages = [{ - "role": "system", - "content": "you are a helpful assistant" - }, { - "role": "user", - "content": "what is 1+1?" - }] - - # test single completion - chat_completion = await client.chat.completions.create( - model=model_name, - messages=messages, - max_tokens=10, - temperature=0.0, - ) - output = chat_completion.choices[0].message.content - stop_reason = chat_completion.choices[0].finish_reason - - # test streaming - stream = await client.chat.completions.create( - model=model_name, - messages=messages, - max_tokens=10, - temperature=0.0, - stream=True, - ) - chunks: List[str] = [] - finish_reason_count = 0 - async for chunk in stream: - delta = chunk.choices[0].delta - if delta.role: - assert delta.role == "assistant" - if delta.content: - chunks.append(delta.content) - if chunk.choices[0].finish_reason is not None: - finish_reason_count += 1 - # finish reason should only return in last block - assert finish_reason_count == 1 - assert chunk.choices[0].finish_reason == stop_reason - assert delta.content - assert "".join(chunks) == output - - -@pytest.mark.asyncio -@pytest.mark.parametrize( - "model_name", - ["HuggingFaceH4/zephyr-7b-beta", "zephyr-lora"], -) -async def test_chat_completion_stream_options(client: openai.AsyncOpenAI, - model_name: str): - messages = [{ - "role": "system", - "content": "You are a helpful assistant." 
- }, { - "role": "user", - "content": "What is the capital of France?" - }] - - # Test stream=True, stream_options={"include_usage": False} - stream = await client.chat.completions.create( - model=model_name, - messages=messages, - max_tokens=10, - temperature=0.0, - stream=True, - stream_options={"include_usage": False}) - async for chunk in stream: - assert chunk.usage is None - - # Test stream=True, stream_options={"include_usage": True} - stream = await client.chat.completions.create( - model=model_name, - messages=messages, - max_tokens=10, - temperature=0.0, - stream=True, - stream_options={"include_usage": True}) - - async for chunk in stream: - if chunk.choices[0].finish_reason is None: - assert chunk.usage is None - else: - assert chunk.usage is None - final_chunk = await stream.__anext__() - assert final_chunk.usage is not None - assert final_chunk.usage.prompt_tokens > 0 - assert final_chunk.usage.completion_tokens > 0 - assert final_chunk.usage.total_tokens == ( - final_chunk.usage.prompt_tokens + - final_chunk.usage.completion_tokens) - assert final_chunk.choices == [] - - # Test stream=False, stream_options={"include_usage": None} - with pytest.raises(BadRequestError): - await client.chat.completions.create( - model=model_name, - messages=messages, - max_tokens=10, - temperature=0.0, - stream=False, - stream_options={"include_usage": None}) - - # Test stream=False, stream_options={"include_usage": True} - with pytest.raises(BadRequestError): - await client.chat.completions.create( - model=model_name, - messages=messages, - max_tokens=10, - temperature=0.0, - stream=False, - stream_options={"include_usage": True}) - - -@pytest.mark.asyncio -@pytest.mark.parametrize( - "model_name", - ["HuggingFaceH4/zephyr-7b-beta", "zephyr-lora"], -) -async def test_completion_stream_options(client: openai.AsyncOpenAI, - model_name: str): - prompt = "What is the capital of France?" 
- - # Test stream=True, stream_options={"include_usage": False} - stream = await client.completions.create( - model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - stream=True, - stream_options={"include_usage": False}) - async for chunk in stream: - assert chunk.usage is None - - # Test stream=True, stream_options={"include_usage": True} - stream = await client.completions.create( - model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - stream=True, - stream_options={"include_usage": True}) - async for chunk in stream: - if chunk.choices[0].finish_reason is None: - assert chunk.usage is None - else: - assert chunk.usage is None - final_chunk = await stream.__anext__() - assert final_chunk.usage is not None - assert final_chunk.usage.prompt_tokens > 0 - assert final_chunk.usage.completion_tokens > 0 - assert final_chunk.usage.total_tokens == ( - final_chunk.usage.prompt_tokens + - final_chunk.usage.completion_tokens) - assert final_chunk.choices == [] - - # Test stream=False, stream_options={"include_usage": None} - with pytest.raises(BadRequestError): - await client.completions.create(model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - stream=False, - stream_options={"include_usage": None}) - - # Test stream=False, stream_options={"include_usage": True} - with pytest.raises(BadRequestError): - await client.completions.create(model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - stream=False, - stream_options={"include_usage": True}) - - -@pytest.mark.asyncio -@pytest.mark.parametrize( - # just test 1 lora hereafter - "model_name", - [MODEL_NAME, "zephyr-lora"], -) -async def test_batch_completions(client: openai.AsyncOpenAI, model_name: str): - # test both text and token IDs - for prompts in (["Hello, my name is"] * 2, [[0, 0, 0, 0, 0]] * 2): - # test simple list - batch = await client.completions.create( - model=model_name, - prompt=prompts, - max_tokens=5, - temperature=0.0, - ) - assert len(batch.choices) == 2 - assert batch.choices[0].text == batch.choices[1].text - - # test n = 2 - batch = await client.completions.create( - model=model_name, - prompt=prompts, - n=2, - max_tokens=5, - temperature=0.0, - extra_body=dict( - # NOTE: this has to be true for n > 1 in vLLM, but not necessary - # for official client. 
- use_beam_search=True), - ) - assert len(batch.choices) == 4 - assert batch.choices[0].text != batch.choices[ - 1].text, "beam search should be different" - assert batch.choices[0].text == batch.choices[ - 2].text, "two copies of the same prompt should be the same" - assert batch.choices[1].text == batch.choices[ - 3].text, "two copies of the same prompt should be the same" - - # test streaming - batch = await client.completions.create( - model=model_name, - prompt=prompts, - max_tokens=5, - temperature=0.0, - stream=True, - ) - texts = [""] * 2 - async for chunk in batch: - assert len(chunk.choices) == 1 - choice = chunk.choices[0] - texts[choice.index] += choice.text - assert texts[0] == texts[1] - - -@pytest.mark.asyncio -async def test_logits_bias(client: openai.AsyncOpenAI): - prompt = "Hello, my name is" - max_tokens = 5 - tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME) - - # Test exclusive selection - token_id = 1000 - completion = await client.completions.create( - model=MODEL_NAME, - prompt=prompt, - max_tokens=max_tokens, - temperature=0.0, - logit_bias={str(token_id): 100}, - seed=42, - ) - assert len(completion.choices[0].text) >= 5 - response_tokens = tokenizer(completion.choices[0].text, - add_special_tokens=False)["input_ids"] - expected_tokens = tokenizer(tokenizer.decode([token_id] * 5), - add_special_tokens=False)["input_ids"] - assert all([ - response == expected - for response, expected in zip(response_tokens, expected_tokens) - ]) - - # Test ban - completion = await client.completions.create( - model=MODEL_NAME, - prompt=prompt, - max_tokens=max_tokens, - temperature=0.0, - ) - response_tokens = tokenizer(completion.choices[0].text, - add_special_tokens=False)["input_ids"] - first_response = completion.choices[0].text - completion = await client.completions.create( - model=MODEL_NAME, - prompt=prompt, - max_tokens=max_tokens, - temperature=0.0, - logit_bias={str(token): -100 - for token in response_tokens}, - ) - assert first_response != completion.choices[0].text - - -@pytest.mark.asyncio -@pytest.mark.parametrize("guided_decoding_backend", - ["outlines", "lm-format-enforcer"]) -async def test_guided_json_completion(client: openai.AsyncOpenAI, - guided_decoding_backend: str): - completion = await client.completions.create( - model=MODEL_NAME, - prompt=f"Give an example JSON for an employee profile " - f"that fits this schema: {TEST_SCHEMA}", - n=3, - temperature=1.0, - max_tokens=500, - extra_body=dict(guided_json=TEST_SCHEMA, - guided_decoding_backend=guided_decoding_backend)) - - assert completion.id is not None - assert len(completion.choices) == 3 - for i in range(3): - output_json = json.loads(completion.choices[i].text) - jsonschema.validate(instance=output_json, schema=TEST_SCHEMA) - - -@pytest.mark.asyncio -@pytest.mark.parametrize("guided_decoding_backend", - ["outlines", "lm-format-enforcer"]) -async def test_guided_json_chat(client: openai.AsyncOpenAI, - guided_decoding_backend: str): - messages = [{ - "role": "system", - "content": "you are a helpful assistant" - }, { - "role": - "user", - "content": - f"Give an example JSON for an employee profile that " - f"fits this schema: {TEST_SCHEMA}" - }] - chat_completion = await client.chat.completions.create( - model=MODEL_NAME, - messages=messages, - max_tokens=1000, - extra_body=dict(guided_json=TEST_SCHEMA, - guided_decoding_backend=guided_decoding_backend)) - message = chat_completion.choices[0].message - assert message.content is not None - json1 = json.loads(message.content) - 
jsonschema.validate(instance=json1, schema=TEST_SCHEMA) - - messages.append({"role": "assistant", "content": message.content}) - messages.append({ - "role": - "user", - "content": - "Give me another one with a different name and age" - }) - chat_completion = await client.chat.completions.create( - model=MODEL_NAME, - messages=messages, - max_tokens=1000, - extra_body=dict(guided_json=TEST_SCHEMA, - guided_decoding_backend=guided_decoding_backend)) - message = chat_completion.choices[0].message - assert message.content is not None - json2 = json.loads(message.content) - jsonschema.validate(instance=json2, schema=TEST_SCHEMA) - assert json1["name"] != json2["name"] - assert json1["age"] != json2["age"] - - -@pytest.mark.asyncio -@pytest.mark.parametrize("guided_decoding_backend", - ["outlines", "lm-format-enforcer"]) -async def test_guided_regex_completion(client: openai.AsyncOpenAI, - guided_decoding_backend: str): - completion = await client.completions.create( - model=MODEL_NAME, - prompt=f"Give an example IPv4 address with this regex: {TEST_REGEX}", - n=3, - temperature=1.0, - max_tokens=20, - extra_body=dict(guided_regex=TEST_REGEX, - guided_decoding_backend=guided_decoding_backend)) - - assert completion.id is not None - assert len(completion.choices) == 3 - for i in range(3): - assert re.fullmatch(TEST_REGEX, completion.choices[i].text) is not None - - -@pytest.mark.asyncio -@pytest.mark.parametrize("guided_decoding_backend", - ["outlines", "lm-format-enforcer"]) -async def test_guided_regex_chat(client: openai.AsyncOpenAI, - guided_decoding_backend: str): - messages = [{ - "role": "system", - "content": "you are a helpful assistant" - }, { - "role": - "user", - "content": - f"Give an example IP address with this regex: {TEST_REGEX}" - }] - chat_completion = await client.chat.completions.create( - model=MODEL_NAME, - messages=messages, - max_tokens=20, - extra_body=dict(guided_regex=TEST_REGEX, - guided_decoding_backend=guided_decoding_backend)) - ip1 = chat_completion.choices[0].message.content - assert ip1 is not None - assert re.fullmatch(TEST_REGEX, ip1) is not None - - messages.append({"role": "assistant", "content": ip1}) - messages.append({"role": "user", "content": "Give me a different one"}) - chat_completion = await client.chat.completions.create( - model=MODEL_NAME, - messages=messages, - max_tokens=20, - extra_body=dict(guided_regex=TEST_REGEX, - guided_decoding_backend=guided_decoding_backend)) - ip2 = chat_completion.choices[0].message.content - assert ip2 is not None - assert re.fullmatch(TEST_REGEX, ip2) is not None - assert ip1 != ip2 - - -@pytest.mark.asyncio -@pytest.mark.parametrize("guided_decoding_backend", - ["outlines", "lm-format-enforcer"]) -async def test_guided_choice_completion(client: openai.AsyncOpenAI, - guided_decoding_backend: str): - completion = await client.completions.create( - model=MODEL_NAME, - prompt="The best language for type-safe systems programming is ", - n=2, - temperature=1.0, - max_tokens=10, - extra_body=dict(guided_choice=TEST_CHOICE, - guided_decoding_backend=guided_decoding_backend)) - - assert completion.id is not None - assert len(completion.choices) == 2 - for i in range(2): - assert completion.choices[i].text in TEST_CHOICE - - -@pytest.mark.asyncio -@pytest.mark.parametrize("guided_decoding_backend", - ["outlines", "lm-format-enforcer"]) -async def test_guided_choice_chat(client: openai.AsyncOpenAI, - guided_decoding_backend: str): - messages = [{ - "role": "system", - "content": "you are a helpful assistant" - }, { - "role": 
- "user", - "content": - "The best language for type-safe systems programming is " - }] - chat_completion = await client.chat.completions.create( - model=MODEL_NAME, - messages=messages, - max_tokens=10, - extra_body=dict(guided_choice=TEST_CHOICE, - guided_decoding_backend=guided_decoding_backend)) - choice1 = chat_completion.choices[0].message.content - assert choice1 in TEST_CHOICE - - messages.append({"role": "assistant", "content": choice1}) - messages.append({ - "role": "user", - "content": "I disagree, pick another one" - }) - chat_completion = await client.chat.completions.create( - model=MODEL_NAME, - messages=messages, - max_tokens=10, - extra_body=dict(guided_choice=TEST_CHOICE, - guided_decoding_backend=guided_decoding_backend)) - choice2 = chat_completion.choices[0].message.content - assert choice2 in TEST_CHOICE - assert choice1 != choice2 - - -@pytest.mark.asyncio -@pytest.mark.parametrize("guided_decoding_backend", - ["outlines", "lm-format-enforcer"]) -async def test_guided_decoding_type_error(client: openai.AsyncOpenAI, - guided_decoding_backend: str): - with pytest.raises(openai.BadRequestError): - _ = await client.completions.create( - model=MODEL_NAME, - prompt="Give an example JSON that fits this schema: 42", - extra_body=dict(guided_json=42, - guided_decoding_backend=guided_decoding_backend)) - - messages = [{ - "role": "system", - "content": "you are a helpful assistant" - }, { - "role": - "user", - "content": - "The best language for type-safe systems programming is " - }] - with pytest.raises(openai.BadRequestError): - _ = await client.chat.completions.create(model=MODEL_NAME, - messages=messages, - extra_body=dict(guided_regex={ - 1: "Python", - 2: "C++" - })) - - with pytest.raises(openai.BadRequestError): - _ = await client.completions.create( - model=MODEL_NAME, - prompt="Give an example string that fits this regex", - extra_body=dict(guided_regex=TEST_REGEX, guided_json=TEST_SCHEMA)) - - -@pytest.mark.asyncio -@pytest.mark.parametrize("guided_decoding_backend", - ["outlines", "lm-format-enforcer"]) -async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI, - guided_decoding_backend: str): - messages = [{ - "role": "system", - "content": "you are a helpful assistant" - }, { - "role": - "user", - "content": - "The best language for type-safe systems programming is " - }] - chat_completion = await client.chat.completions.create( - model=MODEL_NAME, - messages=messages, - max_tokens=10, - logprobs=True, - top_logprobs=5, - extra_body=dict(guided_choice=TEST_CHOICE, - guided_decoding_backend=guided_decoding_backend)) - - assert chat_completion.choices[0].logprobs is not None - assert chat_completion.choices[0].logprobs.content is not None - top_logprobs = chat_completion.choices[0].logprobs.content[0].top_logprobs - - # -9999.0 is the minimum logprob returned by OpenAI - for item in top_logprobs: - assert item.logprob >= -9999.0, f"Failed (top_logprobs={top_logprobs})" - - -@pytest.mark.asyncio -@pytest.mark.parametrize("guided_decoding_backend", - ["outlines", "lm-format-enforcer"]) -async def test_named_tool_use(client: openai.AsyncOpenAI, - guided_decoding_backend: str): - messages = [{ - "role": "system", - "content": "you are a helpful assistant" - }, { - "role": - "user", - "content": - f"Give an example JSON for an employee profile that " - f"fits this schema: {TEST_SCHEMA}" - }] - - # non-streaming - - chat_completion = await client.chat.completions.create( - model=MODEL_NAME, - messages=messages, - max_tokens=1000, - tools=[{ - "type": 
"function", - "function": { - "name": "dummy_function_name", - "description": "This is a dummy function", - "parameters": TEST_SCHEMA - } - }], - tool_choice={ - "type": "function", - "function": { - "name": "dummy_function_name" - } - }) - message = chat_completion.choices[0].message - assert len(message.content) == 0 - json_string = message.tool_calls[0].function.arguments - json1 = json.loads(json_string) - jsonschema.validate(instance=json1, schema=TEST_SCHEMA) - - messages.append({"role": "assistant", "content": json_string}) - messages.append({ - "role": - "user", - "content": - "Give me another one with a different name and age" - }) - - # streaming - - stream = await client.chat.completions.create( - model=MODEL_NAME, - messages=messages, - max_tokens=1000, - tools=[{ - "type": "function", - "function": { - "name": "dummy_function_name", - "description": "This is a dummy function", - "parameters": TEST_SCHEMA - } - }], - tool_choice={ - "type": "function", - "function": { - "name": "dummy_function_name" - } - }, - stream=True) - - output = [] - finish_reason_count = 0 - async for chunk in stream: - delta = chunk.choices[0].delta - if delta.role: - assert delta.role == "assistant" - assert delta.content is None or len(delta.content) == 0 - if delta.tool_calls: - output.append(delta.tool_calls[0].function.arguments) - if chunk.choices[0].finish_reason is not None: - finish_reason_count += 1 - # finish reason should only return in last block - assert finish_reason_count == 1 - json2 = json.loads("".join(output)) - jsonschema.validate(instance=json2, schema=TEST_SCHEMA) - assert json1["name"] != json2["name"] - assert json1["age"] != json2["age"] - - -@pytest.mark.asyncio -@pytest.mark.parametrize("guided_decoding_backend", ["outlines"]) -async def test_required_tool_use_not_yet_supported( - client: openai.AsyncOpenAI, guided_decoding_backend: str): - messages = [{ - "role": "system", - "content": "you are a helpful assistant" - }, { - "role": - "user", - "content": - f"Give an example JSON for an employee profile that " - f"fits this schema: {TEST_SCHEMA}" - }] - - with pytest.raises(openai.BadRequestError): - await client.chat.completions.create( - model=MODEL_NAME, - messages=messages, - max_tokens=1000, - tools=[{ - "type": "function", - "function": { - "name": "dummy_function_name", - "description": "This is a dummy function", - "parameters": TEST_SCHEMA - } - }], - tool_choice="required") - - with pytest.raises(openai.BadRequestError): - await client.chat.completions.create( - model=MODEL_NAME, - messages=messages, - max_tokens=1000, - tools=[{ - "type": "function", - "function": { - "name": "dummy_function_name", - "description": "This is a dummy function", - "parameters": TEST_SCHEMA - } - }], - tool_choice="auto") - - -@pytest.mark.asyncio -@pytest.mark.parametrize("guided_decoding_backend", ["outlines"]) -async def test_inconsistent_tool_choice_and_tools( - client: openai.AsyncOpenAI, guided_decoding_backend: str): - messages = [{ - "role": "system", - "content": "you are a helpful assistant" - }, { - "role": - "user", - "content": - f"Give an example JSON for an employee profile that " - f"fits this schema: {TEST_SCHEMA}" - }] - - with pytest.raises(openai.BadRequestError): - await client.chat.completions.create(model=MODEL_NAME, - messages=messages, - max_tokens=1000, - tool_choice={ - "type": "function", - "function": { - "name": - "dummy_function_name" - } - }) - - with pytest.raises(openai.BadRequestError): - await client.chat.completions.create( - model=MODEL_NAME, - 
messages=messages, - max_tokens=1000, - tools=[{ - "type": "function", - "function": { - "name": "dummy_function_name", - "description": "This is a dummy function", - "parameters": TEST_SCHEMA - } - }], - tool_choice={ - "type": "function", - "function": { - "name": "nondefined_function_name" - } - }) - - -@pytest.mark.asyncio -async def test_response_format_json_object(client: openai.AsyncOpenAI): - for _ in range(2): - resp = await client.chat.completions.create( - model=MODEL_NAME, - messages=[{ - "role": - "user", - "content": ('what is 1+1? please respond with a JSON object, ' - 'the format is {"result": 2}') - }], - response_format={"type": "json_object"}) - - content = resp.choices[0].message.content - assert content is not None - - loaded = json.loads(content) - assert loaded == {"result": 2}, loaded +# @pytest.mark.asyncio +# @pytest.mark.parametrize( +# # first test base model, then test loras +# "model_name", +# [MODEL_NAME, "zephyr-lora", "zephyr-lora2"], +# ) +# async def test_single_completion(client: openai.AsyncOpenAI, model_name: str): +# completion = await client.completions.create(model=model_name, +# prompt="Hello, my name is", +# max_tokens=5, +# temperature=0.0) + +# assert completion.id is not None +# assert completion.choices is not None and len(completion.choices) == 1 + +# choice = completion.choices[0] +# assert len(choice.text) >= 5 +# assert choice.finish_reason == "length" +# assert completion.usage == openai.types.CompletionUsage( +# completion_tokens=5, prompt_tokens=6, total_tokens=11) + +# # test using token IDs +# completion = await client.completions.create( +# model=MODEL_NAME, +# prompt=[0, 0, 0, 0, 0], +# max_tokens=5, +# temperature=0.0, +# ) +# assert len(completion.choices[0].text) >= 5 + + +# @pytest.mark.asyncio +# @pytest.mark.parametrize( +# # first test base model, then test loras +# "model_name", +# [MODEL_NAME, "zephyr-lora", "zephyr-lora2"], +# ) +# async def test_no_logprobs(client: openai.AsyncOpenAI, model_name: str): +# # test using token IDs +# completion = await client.completions.create( +# model=MODEL_NAME, +# prompt=[0, 0, 0, 0, 0], +# max_tokens=5, +# temperature=0.0, +# logprobs=None, +# ) +# choice = completion.choices[0] +# assert choice.logprobs is None + + +# @pytest.mark.asyncio +# @pytest.mark.parametrize( +# # just test 1 lora hereafter +# "model_name", +# [MODEL_NAME, "zephyr-lora"], +# ) +# async def test_zero_logprobs(client: openai.AsyncOpenAI, model_name: str): +# # test using token IDs +# completion = await client.completions.create( +# model=MODEL_NAME, +# prompt=[0, 0, 0, 0, 0], +# max_tokens=5, +# temperature=0.0, +# logprobs=0, +# ) +# choice = completion.choices[0] +# assert choice.logprobs is not None +# assert choice.logprobs.token_logprobs is not None +# assert choice.logprobs.top_logprobs is not None +# assert len(choice.logprobs.top_logprobs[0]) == 1 + + +# @pytest.mark.asyncio +# @pytest.mark.parametrize( +# "model_name", +# [MODEL_NAME, "zephyr-lora"], +# ) +# async def test_some_logprobs(client: openai.AsyncOpenAI, model_name: str): +# # test using token IDs +# completion = await client.completions.create( +# model=MODEL_NAME, +# prompt=[0, 0, 0, 0, 0], +# max_tokens=5, +# temperature=0.0, +# logprobs=5, +# ) +# choice = completion.choices[0] +# assert choice.logprobs is not None +# assert choice.logprobs.token_logprobs is not None +# assert choice.logprobs.top_logprobs is not None +# assert 5 <= len(choice.logprobs.top_logprobs[0]) <= 6 + + +# @pytest.mark.asyncio +# @pytest.mark.parametrize( +# 
"model_name", +# [MODEL_NAME, "zephyr-lora"], +# ) +# async def test_too_many_completion_logprobs(client: openai.AsyncOpenAI, +# model_name: str): + +# with pytest.raises( +# (openai.BadRequestError, openai.APIError)): # test using token IDs +# await client.completions.create( +# model=MODEL_NAME, +# prompt=[0, 0, 0, 0, 0], +# max_tokens=5, +# temperature=0.0, +# # vLLM has higher default max_logprobs (20 instead of 5) to support +# # both Completion API and Chat Completion API +# logprobs=21, +# ) +# ... +# with pytest.raises( +# (openai.BadRequestError, openai.APIError)): # test using token IDs +# stream = await client.completions.create( +# model=MODEL_NAME, +# prompt=[0, 0, 0, 0, 0], +# max_tokens=5, +# temperature=0.0, +# # vLLM has higher default max_logprobs (20 instead of 5) to support +# # both Completion API and Chat Completion API +# logprobs=30, +# stream=True, +# ) +# async for chunk in stream: +# ... + +# # the server should still work afterwards +# completion = await client.completions.create( +# model=model_name, +# prompt=[0, 0, 0, 0, 0], +# max_tokens=5, +# temperature=0.0, +# ) +# assert len(completion.choices[0].text) >= 0 + + +# @pytest.mark.asyncio +# @pytest.mark.parametrize( +# # first test base model, then test loras +# "model_name", +# [MODEL_NAME, "zephyr-lora", "zephyr-lora2"], +# ) +# async def test_no_logprobs_chat(client: openai.AsyncOpenAI, model_name: str): +# messages = [{ +# "role": "system", +# "content": "you are a helpful assistant" +# }, { +# "role": "user", +# "content": "what is 1+1?" +# }] + +# chat_completion = await client.chat.completions.create(model=model_name, +# messages=messages, +# max_tokens=5, +# temperature=0.0, +# logprobs=False) + +# choice = chat_completion.choices[0] +# assert choice.logprobs is None + + +# @pytest.mark.asyncio +# @pytest.mark.parametrize( +# # just test 1 lora hereafter +# "model_name", +# [MODEL_NAME, "zephyr-lora"], +# ) +# async def test_zero_logprobs_chat(client: openai.AsyncOpenAI, model_name: str): +# messages = [{ +# "role": "system", +# "content": "you are a helpful assistant" +# }, { +# "role": "user", +# "content": "what is 1+1?" +# }] + +# chat_completion = await client.chat.completions.create(model=model_name, +# messages=messages, +# max_tokens=5, +# temperature=0.0, +# logprobs=True, +# top_logprobs=0) + +# choice = chat_completion.choices[0] +# assert choice.logprobs is not None +# assert choice.logprobs.content is not None +# assert len(choice.logprobs.content[0].top_logprobs) == 0 + + +# @pytest.mark.asyncio +# @pytest.mark.parametrize( +# "model_name", +# [MODEL_NAME, "zephyr-lora"], +# ) +# async def test_some_logprobs_chat(client: openai.AsyncOpenAI, model_name: str): +# messages = [{ +# "role": "system", +# "content": "you are a helpful assistant" +# }, { +# "role": "user", +# "content": "what is 1+1?" +# }] + +# chat_completion = await client.chat.completions.create(model=model_name, +# messages=messages, +# max_tokens=5, +# temperature=0.0, +# logprobs=True, +# top_logprobs=5) + +# choice = chat_completion.choices[0] +# assert choice.logprobs is not None +# assert choice.logprobs.content is not None +# assert len(choice.logprobs.content[0].top_logprobs) == 5 + + +# @pytest.mark.asyncio +# @pytest.mark.parametrize( +# "model_name", +# [MODEL_NAME, "zephyr-lora"], +# ) +# async def test_too_many_chat_logprobs(client: openai.AsyncOpenAI, +# model_name: str): +# messages = [{ +# "role": "system", +# "content": "you are a helpful assistant" +# }, { +# "role": "user", +# "content": "what is 1+1?" 
+# }] + +# # Default max_logprobs is 20, so this should raise an error +# with pytest.raises((openai.BadRequestError, openai.APIError)): +# stream = await client.chat.completions.create(model=model_name, +# messages=messages, +# max_tokens=10, +# logprobs=True, +# top_logprobs=21, +# stream=True) +# async for chunk in stream: +# ... + +# with pytest.raises(openai.BadRequestError): +# await client.chat.completions.create(model=model_name, +# messages=messages, +# max_tokens=10, +# logprobs=True, +# top_logprobs=30, +# stream=False) + +# # the server should still work afterwards +# chat_completion = await client.chat.completions.create(model=model_name, +# messages=messages, +# max_tokens=10, +# stream=False) +# message = chat_completion.choices[0].message +# assert message.content is not None and len(message.content) >= 0 + + +# @pytest.mark.asyncio +# @pytest.mark.parametrize( +# "model_name", +# [MODEL_NAME, "zephyr-lora"], +# ) +# async def test_single_chat_session(client: openai.AsyncOpenAI, +# model_name: str): +# messages = [{ +# "role": "system", +# "content": "you are a helpful assistant" +# }, { +# "role": "user", +# "content": "what is 1+1?" +# }] + +# # test single completion +# chat_completion = await client.chat.completions.create(model=model_name, +# messages=messages, +# max_tokens=10, +# logprobs=True, +# top_logprobs=5) +# assert chat_completion.id is not None +# assert len(chat_completion.choices) == 1 + +# choice = chat_completion.choices[0] +# assert choice.finish_reason == "length" +# assert chat_completion.usage == openai.types.CompletionUsage( +# completion_tokens=10, prompt_tokens=37, total_tokens=47) + +# message = choice.message +# assert message.content is not None and len(message.content) >= 10 +# assert message.role == "assistant" +# messages.append({"role": "assistant", "content": message.content}) + +# # test multi-turn dialogue +# messages.append({"role": "user", "content": "express your result in json"}) +# chat_completion = await client.chat.completions.create( +# model=model_name, +# messages=messages, +# max_tokens=10, +# ) +# message = chat_completion.choices[0].message +# assert message.content is not None and len(message.content) >= 0 + + +# @pytest.mark.asyncio +# @pytest.mark.parametrize( +# "model_name", +# [MODEL_NAME, "zephyr-lora"], +# ) +# async def test_completion_streaming(client: openai.AsyncOpenAI, +# model_name: str): +# prompt = "What is an LLM?" + +# single_completion = await client.completions.create( +# model=model_name, +# prompt=prompt, +# max_tokens=5, +# temperature=0.0, +# ) +# single_output = single_completion.choices[0].text +# stream = await client.completions.create(model=model_name, +# prompt=prompt, +# max_tokens=5, +# temperature=0.0, +# stream=True) +# chunks: List[str] = [] +# finish_reason_count = 0 +# async for chunk in stream: +# chunks.append(chunk.choices[0].text) +# if chunk.choices[0].finish_reason is not None: +# finish_reason_count += 1 +# # finish reason should only return in last block +# assert finish_reason_count == 1 +# assert chunk.choices[0].finish_reason == "length" +# assert chunk.choices[0].text +# assert "".join(chunks) == single_output + + +# @pytest.mark.asyncio +# @pytest.mark.parametrize( +# # just test 1 lora hereafter +# "model_name", +# [MODEL_NAME, "zephyr-lora"], +# ) +# async def test_chat_streaming(client: openai.AsyncOpenAI, model_name: str): +# messages = [{ +# "role": "system", +# "content": "you are a helpful assistant" +# }, { +# "role": "user", +# "content": "what is 1+1?" 
+# }] + +# # test single completion +# chat_completion = await client.chat.completions.create( +# model=model_name, +# messages=messages, +# max_tokens=10, +# temperature=0.0, +# ) +# output = chat_completion.choices[0].message.content +# stop_reason = chat_completion.choices[0].finish_reason + +# # test streaming +# stream = await client.chat.completions.create( +# model=model_name, +# messages=messages, +# max_tokens=10, +# temperature=0.0, +# stream=True, +# ) +# chunks: List[str] = [] +# finish_reason_count = 0 +# async for chunk in stream: +# delta = chunk.choices[0].delta +# if delta.role: +# assert delta.role == "assistant" +# if delta.content: +# chunks.append(delta.content) +# if chunk.choices[0].finish_reason is not None: +# finish_reason_count += 1 +# # finish reason should only return in last block +# assert finish_reason_count == 1 +# assert chunk.choices[0].finish_reason == stop_reason +# assert delta.content +# assert "".join(chunks) == output + + +# @pytest.mark.asyncio +# @pytest.mark.parametrize( +# "model_name", +# ["HuggingFaceH4/zephyr-7b-beta", "zephyr-lora"], +# ) +# async def test_chat_completion_stream_options(client: openai.AsyncOpenAI, +# model_name: str): +# messages = [{ +# "role": "system", +# "content": "You are a helpful assistant." +# }, { +# "role": "user", +# "content": "What is the capital of France?" +# }] + +# # Test stream=True, stream_options={"include_usage": False} +# stream = await client.chat.completions.create( +# model=model_name, +# messages=messages, +# max_tokens=10, +# temperature=0.0, +# stream=True, +# stream_options={"include_usage": False}) +# async for chunk in stream: +# assert chunk.usage is None + +# # Test stream=True, stream_options={"include_usage": True} +# stream = await client.chat.completions.create( +# model=model_name, +# messages=messages, +# max_tokens=10, +# temperature=0.0, +# stream=True, +# stream_options={"include_usage": True}) + +# async for chunk in stream: +# if chunk.choices[0].finish_reason is None: +# assert chunk.usage is None +# else: +# assert chunk.usage is None +# final_chunk = await stream.__anext__() +# assert final_chunk.usage is not None +# assert final_chunk.usage.prompt_tokens > 0 +# assert final_chunk.usage.completion_tokens > 0 +# assert final_chunk.usage.total_tokens == ( +# final_chunk.usage.prompt_tokens + +# final_chunk.usage.completion_tokens) +# assert final_chunk.choices == [] + +# # Test stream=False, stream_options={"include_usage": None} +# with pytest.raises(BadRequestError): +# await client.chat.completions.create( +# model=model_name, +# messages=messages, +# max_tokens=10, +# temperature=0.0, +# stream=False, +# stream_options={"include_usage": None}) + +# # Test stream=False, stream_options={"include_usage": True} +# with pytest.raises(BadRequestError): +# await client.chat.completions.create( +# model=model_name, +# messages=messages, +# max_tokens=10, +# temperature=0.0, +# stream=False, +# stream_options={"include_usage": True}) + + +# @pytest.mark.asyncio +# @pytest.mark.parametrize( +# "model_name", +# ["HuggingFaceH4/zephyr-7b-beta", "zephyr-lora"], +# ) +# async def test_completion_stream_options(client: openai.AsyncOpenAI, +# model_name: str): +# prompt = "What is the capital of France?" 
+ +# # Test stream=True, stream_options={"include_usage": False} +# stream = await client.completions.create( +# model=model_name, +# prompt=prompt, +# max_tokens=5, +# temperature=0.0, +# stream=True, +# stream_options={"include_usage": False}) +# async for chunk in stream: +# assert chunk.usage is None + +# # Test stream=True, stream_options={"include_usage": True} +# stream = await client.completions.create( +# model=model_name, +# prompt=prompt, +# max_tokens=5, +# temperature=0.0, +# stream=True, +# stream_options={"include_usage": True}) +# async for chunk in stream: +# if chunk.choices[0].finish_reason is None: +# assert chunk.usage is None +# else: +# assert chunk.usage is None +# final_chunk = await stream.__anext__() +# assert final_chunk.usage is not None +# assert final_chunk.usage.prompt_tokens > 0 +# assert final_chunk.usage.completion_tokens > 0 +# assert final_chunk.usage.total_tokens == ( +# final_chunk.usage.prompt_tokens + +# final_chunk.usage.completion_tokens) +# assert final_chunk.choices == [] + +# # Test stream=False, stream_options={"include_usage": None} +# with pytest.raises(BadRequestError): +# await client.completions.create(model=model_name, +# prompt=prompt, +# max_tokens=5, +# temperature=0.0, +# stream=False, +# stream_options={"include_usage": None}) + +# # Test stream=False, stream_options={"include_usage": True} +# with pytest.raises(BadRequestError): +# await client.completions.create(model=model_name, +# prompt=prompt, +# max_tokens=5, +# temperature=0.0, +# stream=False, +# stream_options={"include_usage": True}) + + +# @pytest.mark.asyncio +# @pytest.mark.parametrize( +# # just test 1 lora hereafter +# "model_name", +# [MODEL_NAME, "zephyr-lora"], +# ) +# async def test_batch_completions(client: openai.AsyncOpenAI, model_name: str): +# # test both text and token IDs +# for prompts in (["Hello, my name is"] * 2, [[0, 0, 0, 0, 0]] * 2): +# # test simple list +# batch = await client.completions.create( +# model=model_name, +# prompt=prompts, +# max_tokens=5, +# temperature=0.0, +# ) +# assert len(batch.choices) == 2 +# assert batch.choices[0].text == batch.choices[1].text + +# # test n = 2 +# batch = await client.completions.create( +# model=model_name, +# prompt=prompts, +# n=2, +# max_tokens=5, +# temperature=0.0, +# extra_body=dict( +# # NOTE: this has to be true for n > 1 in vLLM, but not necessary +# # for official client. 
+# use_beam_search=True), +# ) +# assert len(batch.choices) == 4 +# assert batch.choices[0].text != batch.choices[ +# 1].text, "beam search should be different" +# assert batch.choices[0].text == batch.choices[ +# 2].text, "two copies of the same prompt should be the same" +# assert batch.choices[1].text == batch.choices[ +# 3].text, "two copies of the same prompt should be the same" + +# # test streaming +# batch = await client.completions.create( +# model=model_name, +# prompt=prompts, +# max_tokens=5, +# temperature=0.0, +# stream=True, +# ) +# texts = [""] * 2 +# async for chunk in batch: +# assert len(chunk.choices) == 1 +# choice = chunk.choices[0] +# texts[choice.index] += choice.text +# assert texts[0] == texts[1] + + +# @pytest.mark.asyncio +# async def test_logits_bias(client: openai.AsyncOpenAI): +# prompt = "Hello, my name is" +# max_tokens = 5 +# tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME) + +# # Test exclusive selection +# token_id = 1000 +# completion = await client.completions.create( +# model=MODEL_NAME, +# prompt=prompt, +# max_tokens=max_tokens, +# temperature=0.0, +# logit_bias={str(token_id): 100}, +# seed=42, +# ) +# assert len(completion.choices[0].text) >= 5 +# response_tokens = tokenizer(completion.choices[0].text, +# add_special_tokens=False)["input_ids"] +# expected_tokens = tokenizer(tokenizer.decode([token_id] * 5), +# add_special_tokens=False)["input_ids"] +# assert all([ +# response == expected +# for response, expected in zip(response_tokens, expected_tokens) +# ]) + +# # Test ban +# completion = await client.completions.create( +# model=MODEL_NAME, +# prompt=prompt, +# max_tokens=max_tokens, +# temperature=0.0, +# ) +# response_tokens = tokenizer(completion.choices[0].text, +# add_special_tokens=False)["input_ids"] +# first_response = completion.choices[0].text +# completion = await client.completions.create( +# model=MODEL_NAME, +# prompt=prompt, +# max_tokens=max_tokens, +# temperature=0.0, +# logit_bias={str(token): -100 +# for token in response_tokens}, +# ) +# assert first_response != completion.choices[0].text + + +# @pytest.mark.asyncio +# @pytest.mark.parametrize("guided_decoding_backend", +# ["outlines", "lm-format-enforcer"]) +# async def test_guided_json_completion(server, sample_json_schema, +# client: openai.AsyncOpenAI, +# guided_decoding_backend: str): +# completion = await client.completions.create( +# model=MODEL_NAME, +# prompt=f"Give an example JSON for an employee profile " +# f"that fits this schema: {sample_json_schema}", +# n=3, +# temperature=1.0, +# max_tokens=500, +# extra_body=dict(guided_json=sample_json_schema)) + +# assert completion.id is not None +# assert len(completion.choices) == 3 +# for i in range(3): +# output_json = json.loads(completion.choices[i].text) +# jsonschema.validate(instance=output_json, schema=sample_json_schema) + + +# @pytest.mark.asyncio +# @pytest.mark.parametrize("guided_decoding_backend", +# ["outlines", "lm-format-enforcer"]) +# async def test_guided_json_chat(server, sample_json_schema, +# client: openai.AsyncOpenAI, +# guided_decoding_backend: str): +# messages = [{ +# "role": "system", +# "content": "you are a helpful assistant" +# }, { +# "role": +# "user", +# "content": +# f"Give an example JSON for an employee profile that " +# f"fits this schema: {sample_json_schema}" +# }] +# chat_completion = await client.chat.completions.create( +# model=MODEL_NAME, +# messages=messages, +# max_tokens=500, +# extra_body=dict(guided_json=sample_json_schema, +# 
guided_decoding_backend=guided_decoding_backend)) +# message = chat_completion.choices[0].message +# assert message.content is not None +# json1 = json.loads(message.content) +# jsonschema.validate(instance=json1, schema=sample_json_schema) + +# messages.append({"role": "assistant", "content": message.content}) +# messages.append({ +# "role": +# "user", +# "content": +# "Give me another one with a different name and age" +# }) +# chat_completion = await client.chat.completions.create( +# model=MODEL_NAME, +# messages=messages, +# max_tokens=500, +# extra_body=dict(guided_json=sample_json_schema, +# guided_decoding_backend=guided_decoding_backend)) +# message = chat_completion.choices[0].message +# assert message.content is not None +# json2 = json.loads(message.content) +# jsonschema.validate(instance=json2, schema=sample_json_schema) +# assert json1["name"] != json2["name"] +# assert json1["age"] != json2["age"] + + +# @pytest.mark.asyncio +# @pytest.mark.parametrize("guided_decoding_backend", +# ["outlines", "lm-format-enforcer"]) +# async def test_guided_regex_completion(server, sample_regex, +# client: openai.AsyncOpenAI, +# guided_decoding_backend: str): +# completion = await client.completions.create( +# model=MODEL_NAME, +# prompt=f"Give an example IPv4 address with this regex: {sample_regex}", +# n=3, +# temperature=1.0, +# max_tokens=20, +# extra_body=dict(guided_regex=sample_regex, +# guided_decoding_backend=guided_decoding_backend)) + +# assert completion.id is not None +# assert len(completion.choices) == 3 +# for i in range(3): +# assert completion.choices[i].text is not None +# assert re.fullmatch(sample_regex, +# completion.choices[i].text) is not None + + +# @pytest.mark.asyncio +# @pytest.mark.parametrize("guided_decoding_backend", +# ["outlines", "lm-format-enforcer"]) +# async def test_guided_regex_chat(server, sample_regex, +# client: openai.AsyncOpenAI, +# guided_decoding_backend: str): +# messages = [{ +# "role": "system", +# "content": "you are a helpful assistant" +# }, { +# "role": +# "user", +# "content": +# f"Give an example IP address with this regex: {sample_regex}" +# }] +# chat_completion = await client.chat.completions.create( +# model=MODEL_NAME, +# messages=messages, +# max_tokens=20, +# extra_body=dict(guided_regex=sample_regex, +# guided_decoding_backend=guided_decoding_backend)) +# ip1 = chat_completion.choices[0].message.content +# assert ip1 is not None +# assert re.fullmatch(sample_regex, ip1) is not None + +# messages.append({"role": "assistant", "content": ip1}) +# messages.append({"role": "user", "content": "Give me a different one"}) +# chat_completion = await client.chat.completions.create( +# model=MODEL_NAME, +# messages=messages, +# max_tokens=20, +# extra_body=dict(guided_regex=sample_regex, +# guided_decoding_backend=guided_decoding_backend)) +# ip2 = chat_completion.choices[0].message.content +# assert ip2 is not None +# assert re.fullmatch(sample_regex, ip2) is not None +# assert ip1 != ip2 + + +# @pytest.mark.asyncio +# @pytest.mark.parametrize("guided_decoding_backend", +# ["outlines", "lm-format-enforcer"]) +# async def test_guided_choice_completion(server, sample_guided_choice, +# client: openai.AsyncOpenAI, +# guided_decoding_backend: str): +# completion = await client.completions.create( +# model=MODEL_NAME, +# prompt="The best language for type-safe systems programming is ", +# n=2, +# temperature=1.0, +# max_tokens=10, +# extra_body=dict(guided_choice=sample_guided_choice, +# guided_decoding_backend=guided_decoding_backend)) + +# 
assert completion.id is not None +# assert len(completion.choices) == 2 +# for i in range(2): +# assert completion.choices[i].text in sample_guided_choice + + +# @pytest.mark.asyncio +# @pytest.mark.parametrize("guided_decoding_backend", +# ["outlines", "lm-format-enforcer"]) +# async def test_guided_choice_chat(server, sample_guided_choice, +# client: openai.AsyncOpenAI, +# guided_decoding_backend: str): +# messages = [{ +# "role": "system", +# "content": "you are a helpful assistant" +# }, { +# "role": +# "user", +# "content": +# "The best language for type-safe systems programming is " +# }] +# chat_completion = await client.chat.completions.create( +# model=MODEL_NAME, +# messages=messages, +# max_tokens=10, +# extra_body=dict(guided_choice=sample_guided_choice, +# guided_decoding_backend=guided_decoding_backend)) +# choice1 = chat_completion.choices[0].message.content +# assert choice1 in sample_guided_choice + +# messages.append({"role": "assistant", "content": choice1}) +# messages.append({ +# "role": "user", +# "content": "I disagree, pick another one" +# }) +# chat_completion = await client.chat.completions.create( +# model=MODEL_NAME, +# messages=messages, +# max_tokens=10, +# extra_body=dict(guided_choice=sample_guided_choice, +# guided_decoding_backend=guided_decoding_backend)) +# choice2 = chat_completion.choices[0].message.content +# assert choice2 in sample_guided_choice +# assert choice1 != choice2 + + +# @pytest.mark.asyncio +# @pytest.mark.parametrize("guided_decoding_backend", +# ["outlines", "lm-format-enforcer"]) +# async def test_guided_decoding_type_error(server, sample_regex, +# sample_json_schema, +# client: openai.AsyncOpenAI, +# guided_decoding_backend: str): +# with pytest.raises(openai.BadRequestError): +# _ = await client.completions.create( +# model=MODEL_NAME, +# prompt="Give an example JSON that fits this schema: 42", +# extra_body=dict(guided_json=42, +# guided_decoding_backend=guided_decoding_backend)) + +# messages = [{ +# "role": "system", +# "content": "you are a helpful assistant" +# }, { +# "role": +# "user", +# "content": +# "The best language for type-safe systems programming is " +# }] +# with pytest.raises(openai.BadRequestError): +# _ = await client.chat.completions.create(model=MODEL_NAME, +# messages=messages, +# extra_body=dict(guided_regex={ +# 1: "Python", +# 2: "C++" +# })) + +# with pytest.raises(openai.BadRequestError): +# _ = await client.completions.create( +# model=MODEL_NAME, +# prompt="Give an example string that fits this regex", +# extra_body=dict(guided_regex=sample_regex, +# guided_json=sample_json_schema)) + + +# @pytest.mark.asyncio +# @pytest.mark.parametrize("guided_decoding_backend", +# ["outlines", "lm-format-enforcer"]) +# async def test_guided_choice_chat_logprobs(server, sample_guided_choice, +# client: openai.AsyncOpenAI, +# guided_decoding_backend: str): +# messages = [{ +# "role": "system", +# "content": "you are a helpful assistant" +# }, { +# "role": +# "user", +# "content": +# "The best language for type-safe systems programming is " +# }] +# chat_completion = await client.chat.completions.create( +# model=MODEL_NAME, +# messages=messages, +# max_tokens=10, +# logprobs=True, +# top_logprobs=5, +# extra_body=dict(guided_choice=sample_guided_choice, +# guided_decoding_backend=guided_decoding_backend)) + +# assert chat_completion.choices[0].logprobs is not None +# assert chat_completion.choices[0].logprobs.content is not None +# top_logprobs = chat_completion.choices[0].logprobs.content[0].top_logprobs + +# # -9999.0 
is the minimum logprob returned by OpenAI +# for item in top_logprobs: +# assert item.logprob >= -9999.0, f"Failed (top_logprobs={top_logprobs})" + + +# @pytest.mark.asyncio +# @pytest.mark.parametrize("guided_decoding_backend", +# ["outlines", "lm-format-enforcer"]) +# async def test_named_tool_use(client: openai.AsyncOpenAI, +# guided_decoding_backend: str, +# sample_json_schema): +# messages = [{ +# "role": "system", +# "content": "you are a helpful assistant" +# }, { +# "role": +# "user", +# "content": +# f"Give an example JSON for an employee profile that " +# f"fits this schema: {sample_json_schema}" +# }] + +# # non-streaming + +# chat_completion = await client.chat.completions.create( +# model=MODEL_NAME, +# messages=messages, +# max_tokens=1000, +# tools=[{ +# "type": "function", +# "function": { +# "name": "dummy_function_name", +# "description": "This is a dummy function", +# "parameters": sample_json_schema +# } +# }], +# tool_choice={ +# "type": "function", +# "function": { +# "name": "dummy_function_name" +# } +# }) +# message = chat_completion.choices[0].message +# assert len(message.content) == 0 +# json_string = message.tool_calls[0].function.arguments +# json1 = json.loads(json_string) +# jsonschema.validate(instance=json1, schema=sample_json_schema) + +# messages.append({"role": "assistant", "content": json_string}) +# messages.append({ +# "role": +# "user", +# "content": +# "Give me another one with a different name and age" +# }) + +# # streaming + +# stream = await client.chat.completions.create( +# model=MODEL_NAME, +# messages=messages, +# max_tokens=1000, +# tools=[{ +# "type": "function", +# "function": { +# "name": "dummy_function_name", +# "description": "This is a dummy function", +# "parameters": sample_json_schema +# } +# }], +# tool_choice={ +# "type": "function", +# "function": { +# "name": "dummy_function_name" +# } +# }, +# stream=True) + +# output = [] +# finish_reason_count = 0 +# async for chunk in stream: +# delta = chunk.choices[0].delta +# if delta.role: +# assert delta.role == "assistant" +# assert delta.content is None or len(delta.content) == 0 +# if delta.tool_calls: +# output.append(delta.tool_calls[0].function.arguments) +# if chunk.choices[0].finish_reason is not None: +# finish_reason_count += 1 +# # finish reason should only return in last block +# assert finish_reason_count == 1 +# json2 = json.loads("".join(output)) +# jsonschema.validate(instance=json2, schema=sample_json_schema) +# assert json1["name"] != json2["name"] +# assert json1["age"] != json2["age"] + + +# @pytest.mark.asyncio +# @pytest.mark.parametrize("guided_decoding_backend", ["outlines"]) +# async def test_required_tool_use_not_yet_supported( +# client: openai.AsyncOpenAI, guided_decoding_backend: str, +# sample_json_schema): +# messages = [{ +# "role": "system", +# "content": "you are a helpful assistant" +# }, { +# "role": +# "user", +# "content": +# f"Give an example JSON for an employee profile that " +# f"fits this schema: {sample_json_schema}" +# }] + +# with pytest.raises(openai.BadRequestError): +# await client.chat.completions.create( +# model=MODEL_NAME, +# messages=messages, +# max_tokens=1000, +# tools=[{ +# "type": "function", +# "function": { +# "name": "dummy_function_name", +# "description": "This is a dummy function", +# "parameters": sample_json_schema +# } +# }], +# tool_choice="required") + +# with pytest.raises(openai.BadRequestError): +# await client.chat.completions.create( +# model=MODEL_NAME, +# messages=messages, +# max_tokens=1000, +# tools=[{ +# 
"type": "function", +# "function": { +# "name": "dummy_function_name", +# "description": "This is a dummy function", +# "parameters": sample_json_schema +# } +# }], +# tool_choice="auto") + + +# @pytest.mark.asyncio +# @pytest.mark.parametrize("guided_decoding_backend", ["outlines"]) +# async def test_inconsistent_tool_choice_and_tools(client: openai.AsyncOpenAI, +# guided_decoding_backend: str, +# sample_json_schema): +# messages = [{ +# "role": "system", +# "content": "you are a helpful assistant" +# }, { +# "role": +# "user", +# "content": +# f"Give an example JSON for an employee profile that " +# f"fits this schema: {sample_json_schema}" +# }] + +# with pytest.raises(openai.BadRequestError): +# await client.chat.completions.create(model=MODEL_NAME, +# messages=messages, +# max_tokens=1000, +# tool_choice={ +# "type": "function", +# "function": { +# "name": +# "dummy_function_name" +# } +# }) + +# with pytest.raises(openai.BadRequestError): +# await client.chat.completions.create( +# model=MODEL_NAME, +# messages=messages, +# max_tokens=1000, +# tools=[{ +# "type": "function", +# "function": { +# "name": "dummy_function_name", +# "description": "This is a dummy function", +# "parameters": sample_json_schema +# } +# }], +# tool_choice={ +# "type": "function", +# "function": { +# "name": "nondefined_function_name" +# } +# }) + + +# @pytest.mark.asyncio +# async def test_response_format_json_object(client: openai.AsyncOpenAI): +# for _ in range(2): +# result_format = {"result": "2"} +# resp = await client.chat.completions.create( +# model=MODEL_NAME, +# messages=[{ +# "role": +# "user", +# "content": 'what is 1+1? please respond with a JSON object with the format: {"result": "integer"}' +# }], +# response_format={"type": "json_object"}) + +# content = resp.choices[0].message.content +# print(content) +# assert content is not None + +# loaded = json.loads(content) +# print(loaded) +# assert loaded == {"result": 2}, loaded @pytest.mark.asyncio @@ -1275,34 +1244,22 @@ async def test_custom_role(client: openai.AsyncOpenAI): content2 = resp2.choices[0].message.content assert content1 == content2 - @pytest.mark.asyncio -async def test_guided_grammar(client: openai.AsyncOpenAI): - simple_sql_grammar = """ -start: select_statement - -select_statement: "SELECT" column "from" table "where" condition - -column: "col_1" | "col_2" -table: "table_1" | "table_2" -condition: column "=" number - -number: "1" | "2" -""" - +async def test_guided_grammar(server, sample_sql_statements, + client: openai.AsyncOpenAI): completion = await client.completions.create( model=MODEL_NAME, prompt=("Generate a sql state that select col_1 from " "table_1 where it is equals to 1"), temperature=1.0, max_tokens=500, - extra_body=dict(guided_grammar=simple_sql_grammar)) + extra_body=dict(guided_grammar=sample_sql_statements)) content = completion.choices[0].text # use Lark to parse the output, and make sure it's a valid parse tree from lark import Lark - parser = Lark(simple_sql_grammar) + parser = Lark(sample_sql_statements) parser.parse(content) # remove spaces for comparison b/c we removed them in the grammar diff --git a/vllm/core/interfaces.py b/vllm/core/interfaces.py index 8759ee06795b8..73a27d2e00cf3 100644 --- a/vllm/core/interfaces.py +++ b/vllm/core/interfaces.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import enum from abc import ABC, abstractmethod from typing import List diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 9e923493160ed..987e21cc2b5a3 100644 --- 
a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -1,5 +1,6 @@
 from contextlib import contextmanager
-from typing import ClassVar, List, Optional, Sequence, Union, cast, overload
+from typing import (ClassVar, Dict, List, Optional, Sequence, Union, cast,
+                    overload)
 
 from tqdm import tqdm
 from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
@@ -11,6 +12,8 @@
                          parse_and_batch_prompt)
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
+from vllm.model_executor.guided_decoding import (
+    GuidedDecodingFields, get_guided_decoding_logits_processor)
 from vllm.outputs import EmbeddingRequestOutput, RequestOutput
 from vllm.pooling_params import PoolingParams
 from vllm.sampling_params import SamplingParams
@@ -250,6 +253,7 @@ def generate(
         prompt_token_ids: Optional[Union[List[int], List[List[int]]]] = None,
         use_tqdm: bool = True,
         lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None,
+        guided_options: Optional[Union[Dict, "GuidedDecodingFields"]] = None
     ) -> List[RequestOutput]:
         """Generates the completions for the input prompts.
 
@@ -260,9 +264,9 @@ def generate(
         Args:
             inputs: A list of inputs to generate completions for.
             sampling_params: The sampling parameters for text generation. If
-                None, we use the default sampling parameters.
-                When it is a single value, it is applied to every prompt.
-                When it is a list, the list must have the same length as the
+                None, we use the default sampling parameters.
+                When it is a single value, it is applied to every prompt.
+                When it is a list, the list must have the same length as the
                 prompts and it is paired one by one with the prompt.
             use_tqdm: Whether to use tqdm to display the progress bar.
             lora_request: LoRA request to use for generation, if any.
@@ -291,15 +295,18 @@ def generate(
             Union[PromptStrictInputs, Sequence[PromptStrictInputs]],
             prompts)
 
+        if isinstance(guided_options, dict) and len(guided_options) > 1:
+            raise ValueError(
+                "You can only use one kind of guided decoding but multiple "
+                f"are specified: {guided_options}")
         if sampling_params is None:
             # Use default sampling params.
             sampling_params = SamplingParams()
 
-        self._validate_and_add_requests(
-            inputs=inputs,
-            params=sampling_params,
-            lora_request=lora_request,
-        )
+        self._validate_and_add_requests(inputs=inputs,
+                                        params=sampling_params,
+                                        lora_request=lora_request,
+                                        guided_options=guided_options)
 
         outputs = self._run_engine(use_tqdm=use_tqdm)
         return LLMEngine.validate_outputs(outputs, RequestOutput)
@@ -499,22 +506,33 @@ def _validate_and_add_requests(
         params: Union[SamplingParams, Sequence[SamplingParams], PoolingParams,
                       Sequence[PoolingParams]],
         lora_request: Optional[Union[Sequence[LoRARequest], LoRARequest]],
+        guided_options: Optional[Union[Dict, "GuidedDecodingFields"]] = None
     ) -> None:
+
         if isinstance(inputs, (str, dict)):
             # Convert a single prompt to a list.
             inputs = [inputs]
 
         num_requests = len(inputs)
-
-        if isinstance(params, list) and len(params) != num_requests:
-            raise ValueError("The lengths of prompts and params "
-                             "must be the same.")
         if isinstance(lora_request,
                       list) and len(lora_request) != num_requests:
             raise ValueError("The lengths of prompts and lora_request "
                              "must be the same.")
 
+        if params is None:
+            # Use default sampling params.
+ params = [SamplingParams()] * num_requests + elif isinstance(params, list): + if len(params) != num_requests: + raise ValueError("The lengths of prompts and params " + "must be the same.") + + params = [ + self._add_guided_processor(param, guided_options) + for param in params if isinstance(param, SamplingParams) + ] + elif isinstance(params, SamplingParams): + params = self._add_guided_processor(params, guided_options) - # Add requests to the engine. for i, request_inputs in enumerate(inputs): self._add_request( request_inputs, @@ -523,6 +541,42 @@ def _validate_and_add_requests( lora_request, Sequence) else lora_request, ) + def _add_guided_processor( + self, + params: SamplingParams, + guided_options: Optional[Union[Dict, + "GuidedDecodingFields"]] = None): + if guided_options: + if isinstance(guided_options, dict): + guided_options = GuidedDecodingFields(**guided_options) + if guided_options.guided_decoding_backend is None: + decoding_config = self.llm_engine.get_decoding_config() + guided_options.guided_decoding_backend = ( + decoding_config.guided_decoding_backend) + guided_logits_processor = get_guided_decoding_logits_processor( + guided_options, self.get_tokenizer()) + if guided_logits_processor: + if params.logits_processors is None: + params.logits_processors = [] + params.logits_processors.append(guided_logits_processor) + return params + + # def _add_guided_processor(self, params: SamplingParams): + # if options := params.guided_options: + # if isinstance(options, dict): + # options = GuidedDecodingFields(**options) + # if options.guided_decoding_backend is None: + # decoding_config = self.llm_engine.get_decoding_config() + # options.guided_decoding_backend = ( + # decoding_config.guided_decoding_backend) + # guided_logits_processor = get_guided_decoding_logits_processor( + # options, self.get_tokenizer()) + # if guided_logits_processor: + # if params.logits_processors is None: + # params.logits_processors = [] + # params.logits_processors.append(guided_logits_processor) + # return params + def _add_request( self, inputs: PromptInputs, diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 744e1d94511b3..ff022ae4fa44f 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -25,7 +25,7 @@ from vllm.inputs import PromptInputs from vllm.logger import init_logger from vllm.model_executor.guided_decoding import ( - get_guided_decoding_logits_processor) + GuidedDecodingFields, get_guided_decoding_logits_processor_async, get_guided_decoding_logits_processor_async) from vllm.multimodal.image import ImagePixelData from vllm.multimodal.utils import (async_get_and_parse_image, get_full_image_text_prompt) @@ -247,18 +247,20 @@ async def create_chat_completion( add_special_tokens=request.add_special_tokens) sampling_params = request.to_sampling_params() lora_request = self._maybe_get_lora(request) - decoding_config = await self.engine.get_decoding_config() - guided_decoding_backend = request.guided_decoding_backend \ - or decoding_config.guided_decoding_backend - guided_decode_logits_processor = ( - await get_guided_decoding_logits_processor( - guided_decoding_backend, request, await - self.engine.get_tokenizer())) - if guided_decode_logits_processor: + + # request = adapt_request_for_tool_use(request) + # options = GuidedDecodingFields.from_openai_request(request) + if request.guided_decoding_backend is None: + decoding_config = await self.engine.get_decoding_config() + 
request.guided_decoding_backend = ( + decoding_config.guided_decoding_backend) + processors = (await get_guided_decoding_logits_processor_async( + request, await self.engine.get_tokenizer())) + if processors: if sampling_params.logits_processors is None: sampling_params.logits_processors = [] - sampling_params.logits_processors.append( - guided_decode_logits_processor) + sampling_params.logits_processors.append(processors) + except ValueError as e: return self.create_error_response(str(e)) diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index c775fa6daa739..410344f5c678c 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -21,7 +21,7 @@ OpenAIServing) from vllm.logger import init_logger from vllm.model_executor.guided_decoding import ( - get_guided_decoding_logits_processor) + GuidedDecodingFields, get_guided_decoding_logits_processor_async, get_guided_decoding_logits_processor_async) from vllm.outputs import RequestOutput from vllm.sequence import Logprob from vllm.tracing import (contains_trace_headers, extract_trace_headers, @@ -98,18 +98,17 @@ async def create_completion(self, request: CompletionRequest, try: sampling_params = request.to_sampling_params() lora_request = self._maybe_get_lora(request) - decoding_config = await self.engine.get_decoding_config() - guided_decoding_backend = request.guided_decoding_backend \ - or decoding_config.guided_decoding_backend - guided_decode_logit_processor = ( - await get_guided_decoding_logits_processor( - guided_decoding_backend, request, await - self.engine.get_tokenizer())) - if guided_decode_logit_processor is not None: + if request.guided_decoding_backend is None: + decoding_config = await self.engine.get_decoding_config() + request.guided_decoding_backend = ( + decoding_config.guided_decoding_backend) + processors = (await get_guided_decoding_logits_processor_async( + request, await self.engine.get_tokenizer())) + if processors is not None: if sampling_params.logits_processors is None: sampling_params.logits_processors = [] - sampling_params.logits_processors.append( - guided_decode_logit_processor) + sampling_params.logits_processors.append(processors) + prompt_is_tokens, prompts = parse_prompt_format(request.prompt) for i, prompt in enumerate(prompts): diff --git a/vllm/model_executor/guided_decoding/__init__.py b/vllm/model_executor/guided_decoding/__init__.py index 50aa3ec379f4a..a8041f647a4d1 100644 --- a/vllm/model_executor/guided_decoding/__init__.py +++ b/vllm/model_executor/guided_decoding/__init__.py @@ -1,33 +1,95 @@ +import asyncio +import concurrent.futures from typing import Optional, Union from vllm.entrypoints.openai.protocol import ( ChatCompletionNamedToolChoiceParam, ChatCompletionRequest, CompletionRequest) -from vllm.model_executor.guided_decoding.lm_format_enforcer_decoding import ( - get_lm_format_enforcer_guided_decoding_logits_processor) +from vllm.model_executor.guided_decoding.fields import GuidedDecodingFields from vllm.model_executor.guided_decoding.outlines_decoding import ( - get_outlines_guided_decoding_logits_processor) + get_outlines_guided_decoding_logits_processor, get_outlines_guided_decoding_logits_processor_async) from vllm.sampling_params import LogitsProcessor +global_thread_pool = None -async def get_guided_decoding_logits_processor( - guided_decoding_backend: str, request: Union[CompletionRequest, - ChatCompletionRequest], - tokenizer) -> Optional[LogitsProcessor]: - request = 
_adapt_request_for_tool_use(request) - if guided_decoding_backend == 'outlines': - return await get_outlines_guided_decoding_logits_processor( +async def get_guided_decoding_logits_processor_async( + request: Union[CompletionRequest, + ChatCompletionRequest], tokenizer) -> Optional[LogitsProcessor]: + global global_thread_pool + if global_thread_pool is None: + global_thread_pool = concurrent.futures.ThreadPoolExecutor( + max_workers=4) + loop = asyncio.get_running_loop() + + return await loop.run_in_executor( + global_thread_pool, + get_guided_decoding_logits_processor, + request, + tokenizer, + ) +# async def get_guided_decoding_logits_processor_async( +# guided_decoding_backend: str, request: Union[CompletionRequest, +# ChatCompletionRequest], +# tokenizer) -> Optional[LogitsProcessor]: +# request = _adapt_request_for_tool_use(request) + +# if guided_decoding_backend == 'outlines': +# return await get_outlines_guided_decoding_logits_processor_async( +# request, tokenizer) +# if guided_decoding_backend == 'lm-format-enforcer': +# from vllm.model_executor.guided_decoding.lm_format_enforcer_decoding import ( # noqa +# get_lm_format_enforcer_guided_decoding_logits_processor) +# options = GuidedDecodingFields.from_openai_request(request) +# return get_lm_format_enforcer_guided_decoding_logits_processor( +# options, tokenizer) + + # raise ValueError( + # f"Unknown guided decoding backend '{guided_decoding_backend}'. " + # "Must be one of 'outlines, 'lm-format-enforcer'") + + +# async def get_guided_decoding_logits_processor( +# guided_decoding_backend: str, request: Union[CompletionRequest, +# ChatCompletionRequest], +# tokenizer) -> Optional[LogitsProcessor]: +# request = _adapt_request_for_tool_use(request) + +# if guided_decoding_backend == 'outlines': +# return await get_outlines_guided_decoding_logits_processor( +# request, tokenizer) +# if guided_decoding_backend == 'lm-format-enforcer': +# return await get_lm_format_enforcer_guided_decoding_logits_processor( +# request, tokenizer) + +# raise ValueError( +# f"Unknown guided decoding backend '{guided_decoding_backend}'. " +# "Must be one of 'outlines, 'lm-format-enforcer'") + + +def get_guided_decoding_logits_processor( + request: Union[CompletionRequest, + ChatCompletionRequest, GuidedDecodingFields], tokenizer) -> Optional[LogitsProcessor]: + # request = _adapt_request_for_tool_use(request) + if request.guided_decoding_backend == 'outlines': + return get_outlines_guided_decoding_logits_processor( request, tokenizer) - if guided_decoding_backend == 'lm-format-enforcer': - return await get_lm_format_enforcer_guided_decoding_logits_processor( + if request.guided_decoding_backend == 'lm-format-enforcer': + ## Import moved inside function to avoide circular + ## import with vllm.entrypoints.LLM.py + from vllm.model_executor.guided_decoding.lm_format_enforcer_decoding import ( # noqa + get_lm_format_enforcer_guided_decoding_logits_processor) + return get_lm_format_enforcer_guided_decoding_logits_processor( request, tokenizer) raise ValueError( - f"Unknown guided decoding backend '{guided_decoding_backend}'. " + f"Unknown guided decoding backend '{request.guided_decoding_backend}'. 
" "Must be one of 'outlines, 'lm-format-enforcer'") +__all__ = ['get_guided_decoding_logits_processor', 'GuidedDecodingFields'] + + def _adapt_request_for_tool_use(request: Union[CompletionRequest, ChatCompletionRequest]): # the legacy completion API does not support tool use diff --git a/vllm/model_executor/guided_decoding/fields.py b/vllm/model_executor/guided_decoding/fields.py new file mode 100644 index 0000000000000..2c2f7e2f005d0 --- /dev/null +++ b/vllm/model_executor/guided_decoding/fields.py @@ -0,0 +1,48 @@ +from dataclasses import dataclass +from typing import Dict, List, Optional, Union + +from pydantic import BaseModel + +from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, + CompletionRequest) + + +@dataclass +class GuidedDecodingFields: + """One of the fields will be used to retrieve the logit processor.""" + guided_json: Optional[Union[Dict, BaseModel, str]] = None + guided_regex: Optional[str] = None + guided_choice: Optional[List[str]] = None + guided_grammar: Optional[str] = None + guided_decoding_backend: Optional[str] = None + guided_whitespace_pattern: Optional[str] = None + guided_json_object: Optional[bool] = None + + def __post_init__(self): + """Validate that some fields are mutually exclusive.""" + guide_count = sum([ + self.guided_json is not None, + self.guided_regex is not None, + self.guided_choice is not None, + self.guided_grammar is not None, + self.guided_json_object is not None, + ]) + if guide_count > 1: + raise ValueError( + "You can only use one kind of guided decoding but multiple is " + f"specified: {self.__dict__}") + + @classmethod + def from_openai_request(cls, request: Union[CompletionRequest, + ChatCompletionRequest]): + is_json_object = (request.response_format is not None + and request.response_format.type == "json_object") + return cls( + guided_json=request.guided_json, + guided_regex=request.guided_regex, + guided_choice=request.guided_choice, + guided_grammar=request.guided_grammar, + guided_decoding_backend=request.guided_decoding_backend, + guided_whitespace_pattern=request.guided_whitespace_pattern or " ", + guided_json_object=is_json_object or None, + ) diff --git a/vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py b/vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py index d0a5ca5592f9d..24173ddc050a6 100644 --- a/vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py +++ b/vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py @@ -1,7 +1,9 @@ from functools import lru_cache from json import loads as json_loads from typing import Optional, Union - +from vllm.entrypoints.openai.protocol import ( + ChatCompletionRequest, + CompletionRequest) from lmformatenforcer import (CharacterLevelParser, JsonSchemaParser, RegexParser, StringParser, TokenEnforcerTokenizerData, UnionParser) @@ -10,16 +12,15 @@ from pydantic import BaseModel from transformers import PreTrainedTokenizerBase -from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, - CompletionRequest) +from vllm.model_executor.guided_decoding.fields import GuidedDecodingFields from vllm.model_executor.guided_decoding.outlines_decoding import ( get_outlines_guided_decoding_logits_processor) from vllm.sampling_params import LogitsProcessor -async def get_lm_format_enforcer_guided_decoding_logits_processor( - request: Union[CompletionRequest, ChatCompletionRequest], - tokenizer) -> Optional[LogitsProcessor]: +def get_lm_format_enforcer_guided_decoding_logits_processor( + request: Union[CompletionRequest, 
+ ChatCompletionRequest, GuidedDecodingFields], tokenizer) -> Optional[LogitsProcessor]: """ Given an OpenAI-compatible request, check for guided decoding parameters and get the necessary logits processor for the given guide. @@ -40,12 +41,11 @@ async def get_lm_format_enforcer_guided_decoding_logits_processor( character_level_parser = RegexParser(request.guided_regex) elif request.guided_grammar: # CFG grammar not supported by LMFE, revert to outlines - return await get_outlines_guided_decoding_logits_processor( + return get_outlines_guided_decoding_logits_processor( request, tokenizer) - elif (request.response_format is not None - and request.response_format.type == "json_object"): - character_level_parser = JsonSchemaParser( - None) # None means any json object + elif isinstance(request, GuidedDecodingFields) and request.guided_json_object: + # None means any json object + character_level_parser = JsonSchemaParser(None) else: return None diff --git a/vllm/model_executor/guided_decoding/outlines_decoding.py b/vllm/model_executor/guided_decoding/outlines_decoding.py index 721f7e0530cb7..668064d3974da 100644 --- a/vllm/model_executor/guided_decoding/outlines_decoding.py +++ b/vllm/model_executor/guided_decoding/outlines_decoding.py @@ -1,18 +1,19 @@ -import asyncio -import concurrent.futures +from copy import copy from enum import Enum +from functools import lru_cache from json import dumps as json_dumps from re import escape as regex_escape -from typing import Tuple, Union +from typing import Optional, Tuple, Union from pydantic import BaseModel from transformers import PreTrainedTokenizerBase - from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, CompletionRequest) +from vllm.model_executor.guided_decoding.fields import GuidedDecodingFields from vllm.model_executor.guided_decoding.outlines_logits_processors import ( CFGLogitsProcessor, JSONLogitsProcessor, RegexLogitsProcessor) - +import concurrent.futures +import asyncio class GuidedDecodingMode(Enum): JSON = "json" @@ -51,7 +52,7 @@ class GuidedDecodingMode(Enum): global_thread_pool = None # used for generating logits processor fsm -async def get_outlines_guided_decoding_logits_processor( +async def get_outlines_guided_decoding_logits_processor_async( request: Union[CompletionRequest, ChatCompletionRequest], tokenizer: PreTrainedTokenizerBase ) -> Union[JSONLogitsProcessor, RegexLogitsProcessor, CFGLogitsProcessor, @@ -69,16 +70,37 @@ async def get_outlines_guided_decoding_logits_processor( if global_thread_pool is None: global_thread_pool = concurrent.futures.ThreadPoolExecutor( - max_workers=2) + max_workers=4) loop = asyncio.get_running_loop() return await loop.run_in_executor(global_thread_pool, - _get_logits_processor, guide, tokenizer, + _get_cached_logits_processor, guide, tokenizer, mode, request.guided_whitespace_pattern) +def get_outlines_guided_decoding_logits_processor( + request: Union[CompletionRequest, + ChatCompletionRequest, GuidedDecodingFields], tokenizer +) -> Optional[Union[JSONLogitsProcessor, RegexLogitsProcessor]]: + """ + Given an OpenAI-compatible request, check for guided decoding parameters + and get the necessary logits processor for the given guide. + We cache logit processors by (guide, tokenizer), and on cache hit + we make a shallow copy to reuse the same underlying FSM. 
+    """
+    guide, mode = _get_guide_and_mode(request)
+    if not guide or not mode:
+        return None
+
+    logits_processor = copy(
+        _get_cached_logits_processor(guide, tokenizer, mode,
+                                     request.guided_whitespace_pattern))
+    return logits_processor
+
+
 def _get_guide_and_mode(
-    request: Union[CompletionRequest, ChatCompletionRequest]
+    request: Union[CompletionRequest,
+                   ChatCompletionRequest, GuidedDecodingFields]
 ) -> Union[Tuple[str, GuidedDecodingMode], Tuple[None, None]]:

     if request.guided_json:
@@ -102,17 +124,20 @@ def _get_guide_and_mode(
         return choices_regex, GuidedDecodingMode.CHOICE
     elif request.guided_grammar:
         return request.guided_grammar, GuidedDecodingMode.GRAMMAR
-    elif (request.response_format is not None
+    elif (not isinstance(request, GuidedDecodingFields)
+          and request.response_format is not None
           and request.response_format.type == "json_object"):
         return JSON_GRAMMAR, GuidedDecodingMode.GRAMMAR
     else:
         return None, None


-def _get_logits_processor(
-    guide: str, tokenizer: PreTrainedTokenizerBase, mode: GuidedDecodingMode,
-    whitespace_pattern: Union[str, None]
-) -> Union[JSONLogitsProcessor, RegexLogitsProcessor, CFGLogitsProcessor]:
+
+@lru_cache(maxsize=32)
+def _get_cached_logits_processor(guide: str,
+                                 tokenizer: PreTrainedTokenizerBase,
+                                 mode: GuidedDecodingMode,
+                                 whitespace_pattern: Union[str, None]):
     if mode == GuidedDecodingMode.JSON:
         return JSONLogitsProcessor(guide, tokenizer, whitespace_pattern)
     elif mode == GuidedDecodingMode.REGEX or mode == GuidedDecodingMode.CHOICE:
diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py
index 9d8a361353e26..81b336297c395 100644
--- a/vllm/sampling_params.py
+++ b/vllm/sampling_params.py
@@ -104,6 +104,8 @@ class SamplingParams:
         truncate_prompt_tokens: If set to an integer k, will use only the last
             k tokens from the prompt (i.e., left truncation). Defaults to None
             (i.e., no truncation).
+        guided_options: Configuration dictionary for guided decoding. Refer to
+            the `GuidedDecodingFields` class for the available options.
     """

     def __init__(
@@ -134,6 +136,7 @@ def __init__(
         spaces_between_special_tokens: bool = True,
         logits_processors: Optional[List[LogitsProcessor]] = None,
         truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None,
+        # guided_options: Optional[Union[Dict, "GuidedDecodingFields"]] = None
     ) -> None:
         self.n = n
         self.best_of = best_of if best_of is not None else n
@@ -182,6 +185,8 @@ def __init__(
         else:
             self.output_text_buffer_length = 0

+        # self.guided_options = guided_options
+
         self._verify_args()
         if self.use_beam_search:
             self._verify_beam_search()
@@ -342,4 +347,4 @@ def __repr__(self) -> str:
             f"skip_special_tokens={self.skip_special_tokens}, "
             "spaces_between_special_tokens="
             f"{self.spaces_between_special_tokens}, "
-            f"truncate_prompt_tokens={self.truncate_prompt_tokens})")
+            f"truncate_prompt_tokens={self.truncate_prompt_tokens})")
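
Usage note (editor's addition, not part of the patch): the diff above adds a guided_options argument to LLM.generate() and exposes GuidedDecodingFields for the offline path. Below is a minimal sketch of how that API would be exercised, assuming the import paths shown in the patch; the model name, regex, and sampling values are illustrative assumptions only.

from vllm.entrypoints.llm import LLM
from vllm.model_executor.guided_decoding import GuidedDecodingFields
from vllm.sampling_params import SamplingParams

# Any small causal LM works here; opt-125m is just a convenient assumption.
llm = LLM(model="facebook/opt-125m")

# Exactly one guided_* constraint may be set; GuidedDecodingFields raises
# ValueError in __post_init__ if more than one is supplied.
options = GuidedDecodingFields(
    guided_regex=r"\d{4}-\d{2}-\d{2}",
    guided_decoding_backend="outlines",
)

outputs = llm.generate(
    "Today's date in ISO format is ",
    sampling_params=SamplingParams(temperature=0.0, max_tokens=16),
    guided_options=options,
)
print(outputs[0].outputs[0].text)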
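
Design note (editor's addition): the outlines path now builds logits processors behind functools.lru_cache and hands each caller copy(...) of the cached instance, so the expensive guide/FSM compilation is shared across requests while every request gets its own object for mutable decoding state. A generic, hypothetical sketch of that copy-on-cache-hit pattern follows; the class and function names are illustrative, not vLLM's.

from copy import copy
from functools import lru_cache


class GuideProcessor:
    """Stand-in for an FSM-backed logits processor."""

    def __init__(self, guide: str):
        self.guide = guide      # expensive to build, safe to share
        self.state = 0          # cheap, must be per-request


@lru_cache(maxsize=32)
def _build_processor(guide: str) -> GuideProcessor:
    # Compiled once per distinct guide string.
    return GuideProcessor(guide)


def get_processor(guide: str) -> GuideProcessor:
    # Shallow copy on every call: the compiled guide is reused by reference,
    # but rebinding `state` on the copy never touches the cached original.
    return copy(_build_processor(guide))

A shallow copy is sufficient only while per-request attributes are rebound rather than mutated in place; anything mutated in place would still be shared with the cached instance.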