-
Notifications
You must be signed in to change notification settings - Fork 8
/
test_vllm.py
56 lines (45 loc) · 1.86 KB
/
test_vllm.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
# copy pasted from https://docs.vllm.ai/en/latest/getting_started/quickstart.html
# do export VLLM_USE_MODELSCOPE=True
import vllm
from vllm import LLM, SamplingParams
import torch
def test_pytorch():
    """Smoke-test PyTorch: print version info and, if possible, run a CUDA matmul.

    Prints the PyTorch and CUDA build versions and the CUDA device count.
    Only when at least one CUDA device is present does it perform a small
    (2x4 @ 4x1) matrix multiplication on the GPU and print the device name;
    otherwise it reports that no CUDA devices are available.
    """
    print('\n----- Test PyTorch ---')
    # Print the PyTorch version and CUDA version.
    # NOTE: torch.version.cuda is None on CPU-only builds — printed as-is.
    print(f"PyTorch version: {torch.__version__}")
    print(f"CUDA version: {torch.version.cuda}")
    # Check CUDA availability BEFORE touching the GPU. The original called
    # .cuda() unconditionally, which raises on machines without CUDA and
    # made the "No CUDA devices available" branch unreachable in practice.
    print(f'Number of CUDA devices: {torch.cuda.device_count()}')
    if torch.cuda.device_count() > 0:
        # Perform a matrix multiplication on CUDA and print the result.
        result = torch.randn(2, 4).cuda() @ torch.randn(4, 1).cuda()
        print(f"Matrix multiplication result: {result}")
        print(f'Device name: {torch.cuda.get_device_name(0)}')
    else:
        print("No CUDA devices available.")
def test_vllm():
    """Smoke-test vLLM: generate completions for four prompts and print them.

    Loads the small facebook/opt-125m model, samples with temperature=0.8 and
    top_p=0.95, and prints each prompt alongside its generated continuation.
    Requires a working vLLM install (and typically a GPU); the model is
    fetched from the hub / ModelScope on first use.
    """
    print('\n----- Test vLLM ---')
    prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]
    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
    llm = LLM(model="facebook/opt-125m")
    # llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.1")
    # generate() returns a list of RequestOutput, one per prompt. The
    # original annotated this as `iter`, which is the builtin function,
    # not a type — `list` is what vLLM actually returns here.
    outputs: list = llm.generate(prompts, sampling_params)
    print(f'{type(outputs)=}')
    print(f'{type(outputs[0])=}')
    # Print the outputs.
    output: vllm.outputs.RequestOutput
    for output in outputs:
        prompt: str = output.prompt
        generated_text: str = output.outputs[0].text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
if __name__ == "__main__":
    import time
    start_time = time.time()
    test_pytorch()
    test_vllm()
    # Capture the elapsed time ONCE. The original evaluated time.time()
    # three separate times inside the f-string, so the seconds, minutes,
    # and hours figures were computed from three different instants and
    # could disagree with each other.
    elapsed = time.time() - start_time
    print(f"Time taken: {elapsed:.2f} seconds, or {elapsed / 60:.2f} minutes, or {elapsed / 3600:.2f} hours.\a")