-
Notifications
You must be signed in to change notification settings - Fork 145
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Support Llama index for vLLM native (#692)
Signed-off-by: zhenwei-intel <zhenwei.liu@intel.com>
- Loading branch information
1 parent
391c4a5
commit 2e41dcf
Showing
17 changed files
with
1,032 additions
and
6 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
# Copyright (C) 2024 Intel Corporation | ||
# SPDX-License-Identifier: Apache-2.0 | ||
|
||
# HABANA environment
# FROM vault.habana.ai/gaudi-docker/1.16.1/ubuntu22.04/habanalabs/pytorch-installer-2.2.2:latest as hpu
FROM opea/habanalabs:1.16.1-pytorch-installer-2.2.2 AS hpu

ENV LANG=en_US.UTF-8
ARG REPO=https://github.com/huggingface/optimum-habana.git
ARG REPO_VER=v1.12.1

# Install system packages and remove the apt package lists in the same layer
# so they are not baked into the image.
RUN apt-get update && apt-get install -y --no-install-recommends --fix-missing \
    git-lfs \
    libgl1-mesa-glx \
    libjemalloc-dev && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# Create the unprivileged user that the remaining steps run as.
RUN useradd -m -s /bin/bash user && \
    mkdir -p /home/user && \
    chown -R user /home/user/

USER user

RUN git lfs install

# Copy the microservice sources from the build context.
COPY comps /home/user/comps

RUN pip install --no-cache-dir --upgrade-strategy eager optimum[habana] && \
    pip install --no-cache-dir git+https://github.com/HabanaAI/DeepSpeed.git@1.17.0

# Check out the pinned optimum-habana release, install its text-generation
# example requirements, then install the microservice requirements.
RUN git clone ${REPO} /home/user/optimum-habana && \
    cd /home/user/optimum-habana && git checkout ${REPO_VER} && \
    cd examples/text-generation && pip install --no-cache-dir -r requirements.txt && \
    cd /home/user/comps/llms/text-generation/native/langchain && \
    pip install --no-cache-dir -r requirements.txt && \
    pip install --no-cache-dir --upgrade --force-reinstall pydantic

# /home/user is on the path so that `import comps` resolves.
ENV PYTHONPATH=/root:/home/user

WORKDIR /home/user/comps/llms/text-generation/native/langchain

ENTRYPOINT ["python", "llm.py"]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
# LLM Native Microservice | ||
|
||
LLM Native microservice uses [optimum-habana](https://github.com/huggingface/optimum-habana) for model initialization and warm-up, focusing solely on large language models (LLMs). It operates without frameworks like TGI/VLLM, using PyTorch directly for inference, and supports only non-streaming formats. This streamlined approach optimizes performance on Habana hardware. | ||
|
||
## 🚀1. Start Microservice | ||
|
||
If you start the LLM microservice with Docker, the `docker_compose_llm.yaml` file will automatically start a Native LLM service. | ||
|
||
### 1.1 Setup Environment Variables | ||
|
||
To start the Native LLM service, you need to set up the following environment variables first. | ||
|
||
```bash | ||
export LLM_NATIVE_MODEL="Qwen/Qwen2-7B-Instruct" | ||
``` | ||
|
||
### 1.2 Build Docker Image | ||
|
||
```bash | ||
cd ../../../../../ | ||
docker build -t opea/llm-native:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/text-generation/native/llama_index/Dockerfile . | ||
``` | ||
|
||
To start a docker container, you have two options: | ||
|
||
- A. Run Docker with CLI | ||
- B. Run Docker with Docker Compose | ||
|
||
You can choose one as needed. | ||
|
||
### 1.3 Run Docker with CLI (Option A) | ||
|
||
```bash | ||
docker run -d --runtime=habana --name="llm-native-server" -p 9000:9000 -e https_proxy=$https_proxy -e http_proxy=$http_proxy -e TOKENIZERS_PARALLELISM=false -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host -e LLM_NATIVE_MODEL=${LLM_NATIVE_MODEL} opea/llm-native:latest | ||
``` | ||
|
||
### 1.4 Run Docker with Docker Compose (Option B) | ||
|
||
```bash | ||
docker compose -f docker_compose_llm.yaml up -d | ||
``` | ||
|
||
## 🚀2. Consume LLM Service | ||
|
||
### 2.1 Check Service Status | ||
|
||
```bash | ||
curl http://${your_ip}:9000/v1/health_check \ | ||
-X GET \ | ||
-H 'Content-Type: application/json' | ||
``` | ||
|
||
### 2.2 Consume LLM Service | ||
|
||
```bash | ||
curl http://${your_ip}:9000/v1/chat/completions \ | ||
-X POST \ | ||
-d '{"query":"What is Deep Learning?"}' \ | ||
-H 'Content-Type: application/json' | ||
``` |
28 changes: 28 additions & 0 deletions
28
comps/llms/text-generation/native/llama_index/docker_compose_llm.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
# Copyright (C) 2024 Intel Corporation | ||
# SPDX-License-Identifier: Apache-2.0 | ||
|
||
version: "3.8"

services:
  llm:
    image: opea/llm-native:latest
    container_name: llm-native-server
    ports:
      - "9000:9000"
    runtime: habana
    cap_add:
      - SYS_NICE
    ipc: host
    environment:
      no_proxy: ${no_proxy}
      http_proxy: ${http_proxy}
      https_proxy: ${https_proxy}
      LLM_NATIVE_MODEL: ${LLM_NATIVE_MODEL}
      HABANA_VISIBLE_DEVICES: all
      OMPI_MCA_btl_vader_single_copy_mechanism: none
      # Quoted so the YAML parser keeps this as the string "false" instead of
      # converting it to a boolean.
      TOKENIZERS_PARALLELISM: "false"
    restart: unless-stopped

networks:
  default:
    driver: bridge
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,171 @@ | ||
# Copyright (c) 2024 Intel Corporation | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
import sys | ||
|
||
sys.path.append("/test/GenAIComps/") | ||
|
||
import logging | ||
import os | ||
import threading | ||
import time | ||
|
||
import torch | ||
from llama_index.core import PromptTemplate | ||
from template import ChatTemplate, args_dict, input_sentences | ||
from utils import initialize_model | ||
|
||
from comps import ( | ||
GeneratedDoc, | ||
LLMParamsDoc, | ||
ServiceType, | ||
opea_microservices, | ||
register_microservice, | ||
register_statistics, | ||
) | ||
|
||
# LOGFLAG turns on request/response logging in the service handler.
# Environment variables are always strings, so the value is parsed explicitly:
# a plain truthiness check would treat "false" or "0" as enabled.
logflag = os.getenv("LOGFLAG", "").strip().lower() in ("1", "true", "yes", "on")

logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)
|
||
|
||
class Args:
    """Plain attribute container: each keyword argument becomes an attribute."""

    def __init__(self, **entries):
        # Copy the supplied key/value pairs straight into the instance namespace.
        vars(self).update(entries)
|
||
|
||
# Inference objects; they stay None until initialize() assigns them.
model = assistant_model = tokenizer = generation_config = None

# Generation settings taken from the template module's args_dict.
args = Args(**args_dict)

# initialize() sets the flag once setup has completed; the lock serializes
# callers that enter initialize() at the same time.
initialized = False
initialization_lock = threading.Lock()
|
||
|
||
def generate(
    input_query: list,
    device="hpu",
    use_lazy_mode=True,
    use_hpu_graphs=True,
    profiling_steps=0,
    profiling_warmup_steps=0,
    ignore_eos=True,
    profiling_record_shapes=False,
):
    """Generate sequences for a batch of prompts and return the decoded text.

    Uses the module-level ``model``, ``tokenizer``, ``generation_config`` and
    ``assistant_model``, which ``initialize()`` assigns.

    Args:
        input_query: List of prompts, tokenized together as one padded batch.
        device: Not used by this function; inputs are moved to ``model.device``.
            Kept so the signature stays unchanged.
        use_lazy_mode: Passed to ``model.generate`` as ``lazy_mode``.
        use_hpu_graphs: Passed to ``model.generate`` as ``hpu_graphs``.
        profiling_steps: Passed through to ``model.generate``.
        profiling_warmup_steps: Passed through to ``model.generate``.
        ignore_eos: Passed through to ``model.generate``.
        profiling_record_shapes: Passed through to ``model.generate``.

    Returns:
        The result of ``tokenizer.batch_decode`` on the generated ids, with
        special tokens skipped.
    """
    logger.info("[llm - generate] starting to inference with prompt %s", input_query)
    encode_t0 = time.perf_counter()

    # Tokenization
    input_tokens = tokenizer.batch_encode_plus(input_query, return_tensors="pt", padding=True)
    encode_duration = time.perf_counter() - encode_t0
    logger.info("[llm - generate] input tokenized: %s", input_tokens)

    # Move inputs to target device(s)
    for t in input_tokens:
        logger.info("[llm - generate] t: %s", t)
        if torch.is_tensor(input_tokens[t]):
            logger.info("[llm - generate] input[t] is tensor")
            logger.info("[llm - generate] device: %s", model.device)
            input_tokens[t] = input_tokens[t].to(model.device)

    logger.info("[llm - generate] inputs transferred.")

    # model.generate receives this list via the iteration_times argument;
    # the first entry is read below to report time to first token.
    iteration_times = []
    outputs = model.generate(
        **input_tokens,
        generation_config=generation_config,
        assistant_model=assistant_model,
        lazy_mode=use_lazy_mode,
        hpu_graphs=use_hpu_graphs,
        profiling_steps=profiling_steps,
        profiling_warmup_steps=profiling_warmup_steps,
        ignore_eos=ignore_eos,
        iteration_times=iteration_times,
        profiling_record_shapes=profiling_record_shapes,
    ).cpu()
    logger.info("[llm - generate] result generated")

    result = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    logger.info("[llm - generate] result: %s", result)

    # The timing is diagnostic only: if no iteration time was recorded, skip
    # the metric instead of failing a request whose output already exists.
    if iteration_times:
        first_token_time = iteration_times[0] + encode_duration
        logger.info("[llm - generate] Time to first token = %sms", first_token_time * 1000)
    else:
        logger.warning("[llm - generate] no iteration times recorded; time to first token unavailable")
    return result
|
||
|
||
def initialize():
    """Load the model and warm it up once; later calls return immediately.

    The whole check-and-setup sequence runs under ``initialization_lock``.
    ``initialized`` is set only after warm-up and the test generation have
    completed.
    """
    global model, assistant_model, tokenizer, generation_config, initialized
    with initialization_lock:
        if initialized:
            return

        # Habana-specific modules are imported only when setup actually runs.
        import habana_frameworks.torch.hpu as torch_hpu
        from optimum.habana.utils import HabanaProfile

        # initialize model and tokenizer
        model, assistant_model, tokenizer, generation_config = initialize_model(args, logger)
        logger.info("[llm] model and tokenizer initialized.")

        # compilation and model warmup, run with HabanaProfile disabled
        HabanaProfile.disable()
        logger.info("[llm - native] Graph compilation...")
        for _ in range(args.warmup):
            generate(input_sentences)
        logger.info("[llm - native] model warm up finished.")
        torch_hpu.synchronize()
        HabanaProfile.enable()

        # One test generation whose result is written to the log.
        logger.info("[llm - native] Ready to inference")
        sample_output = generate(["What is Deep Learning?"])
        logger.info(f"[llm - native] test result: {sample_output}")

        initialized = True
|
||
|
||
@register_microservice(
    name="opea_service@llm_native_llamaindex",
    service_type=ServiceType.LLM,
    endpoint="/v1/chat/completions",
    host="0.0.0.0",
    port=9000,
)
@register_statistics(names=["opea_service@llm_native_llamaindex"])
def llm_generate(input: LLMParamsDoc):
    """Build a prompt from the request, run generation, and return the result.

    Prompt selection:
      * a chat template whose variables are ``question`` and ``context`` is
        formatted with the query and the newline-joined documents;
      * a chat template whose only variable is ``question`` is formatted with
        the query;
      * any other chat template is logged as unused and the raw query is sent;
      * with no chat template, ``ChatTemplate.generate_rag_prompt`` is used
        when documents are present, otherwise the raw query is sent.

    Returns:
        A ``GeneratedDoc`` holding the first generated text and the original query.
    """
    initialize()
    if logflag:
        logger.info(input)

    prompt = input.query
    prompt_template = None
    if input.chat_template:
        prompt_template = PromptTemplate(input.chat_template)
        template_vars = prompt_template.template_vars

    if not prompt_template:
        if input.documents:
            prompt = ChatTemplate.generate_rag_prompt(input.query, input.documents)
    elif sorted(template_vars) == ["context", "question"]:
        prompt = prompt_template.format(question=input.query, context="\n".join(input.documents))
    elif template_vars == ["question"]:
        prompt = prompt_template.format(question=input.query)
    else:
        logger.info(f"{prompt_template} not used, we only support 2 input variables ['question', 'context']")

    generated = generate([prompt])

    if logflag:
        logger.info(f"[llm - native] inference result: {generated}")
    return GeneratedDoc(text=generated[0], prompt=input.query)
|
||
|
||
if __name__ == "__main__":
    # Look up the microservice registered under this name and start it.
    service = opea_microservices["opea_service@llm_native_llamaindex"]
    service.start()
10 changes: 10 additions & 0 deletions
10
comps/llms/text-generation/native/llama_index/requirements.txt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
docarray | ||
fastapi | ||
httpx | ||
llama_index | ||
opentelemetry-api | ||
opentelemetry-exporter-otlp | ||
opentelemetry-sdk | ||
prometheus-fastapi-instrumentator | ||
shortuuid | ||
uvicorn |
Oops, something went wrong.