diff --git a/comps/llms/text-generation/vllm-xft/README.md b/comps/llms/text-generation/vllm-xft/README.md
new file mode 100644
index 000000000..abc3b1885
--- /dev/null
+++ b/comps/llms/text-generation/vllm-xft/README.md
@@ -0,0 +1,47 @@
+vLLM-xFT is a fork of vLLM that integrates the xFasterTransformer backend while maintaining compatibility with most of the official vLLM features.
+For usage of vLLM-xFT, please refer to [xFasterTransformer/vllm-xft](https://github.com/intel/xFasterTransformer/blob/main/serving/vllm-xft.md)
+
+# 🚀 Start Microservice with Docker
+
+## 1 Build Docker Image
+
+```bash
+cd ../../../../
+docker build -t opea/llm-vllm-xft:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/text-generation/vllm-xft/docker/Dockerfile .
+```
+
+## 2 Run Docker with CLI
+
+```bash
+docker run -it -p 9000:9000 -v /home/sdp/Qwen2-7B-Instruct/:/Qwen2-7B-Instruct/ -e vLLM_LLM_ENDPOINT="http://localhost:18688" -e HF_DATASET_DIR="/Qwen2-7B-Instruct/" -e OUTPUT_DIR="./output" -e TOKEN_PATH="/Qwen2-7B-Instruct/" -e https_proxy=$https_proxy -e http_proxy=$http_proxy --ipc=host opea/llm-vllm-xft:latest
+```
+
+# 🚀3. Consume LLM Service
+
+## 3.1 Check Service Status
+
+```bash
+curl http://${your_ip}:9000/v1/health_check \
+  -X GET \
+  -H 'Content-Type: application/json'
+```
+
+## 3.2 Consume LLM Service
+
+You can set the following model parameters according to your actual needs, such as `max_new_tokens`, `streaming`.
+
+The `streaming` parameter determines the format of the data returned by the API. It returns a text string when `streaming=false` and a text stream when `streaming=true`.
+
+```bash
+# non-streaming mode
+curl http://${your_ip}:9000/v1/chat/completions \
+  -X POST \
+  -d '{"query":"What is Deep Learning?","max_new_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":false}' \
+  -H 'Content-Type: application/json'
+
+# streaming mode
+curl http://${your_ip}:9000/v1/chat/completions \
+  -X POST \
+  -d '{"query":"What is Deep Learning?","max_new_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' \
+  -H 'Content-Type: application/json'
+```
diff --git a/comps/llms/text-generation/vllm-xft/docker/Dockerfile b/comps/llms/text-generation/vllm-xft/docker/Dockerfile
new file mode 100644
index 000000000..db682e04f
--- /dev/null
+++ b/comps/llms/text-generation/vllm-xft/docker/Dockerfile
@@ -0,0 +1,98 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+FROM ubuntu:22.04
+
+ARG TAG=main
+
+RUN apt-get update \
+    && apt-get upgrade -y \
+    && apt-get install -y --no-install-recommends \
+    gcc-12 \
+    g++-12 \
+    make \
+    wget \
+    libnuma-dev \
+    numactl \
+    git \
+    pkg-config \
+    software-properties-common \
+    zlib1g-dev \
+    libssl-dev \
+    libffi-dev \
+    libbz2-dev \
+    libsqlite3-dev \
+    && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 60 \
+    && update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-12 60 \
+    && apt-get autoremove -y \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install python
+WORKDIR /tmp
+RUN wget -q https://www.python.org/ftp/python/3.8.10/Python-3.8.10.tgz \
+    && tar -xzvf Python-3.8.10.tgz
+WORKDIR /tmp/Python-3.8.10
+RUN ./configure --prefix=/usr/bin/python3.8 --enable-optimizations \
+    && make -j \
+    && make install \
+    && update-alternatives --install /usr/bin/python python /usr/bin/python3.8/bin/python3.8 60 \
+    && update-alternatives --install /usr/bin/pip pip /usr/bin/python3.8/bin/pip3 60 \
+    && python -m pip install --no-cache-dir --upgrade pip setuptools \
+    && pip install --no-cache-dir wheel \
+    && rm -rf /tmp/* \
+    && echo "export PATH=/usr/bin/python3.8:\$PATH" >> ~/.bashrc
+
+RUN pip install --no-cache-dir torch==2.3.0+cpu --index-url https://download.pytorch.org/whl/cpu
+RUN pip install --no-cache-dir cmake==3.26.1 transformers==4.41.2 sentencepiece==0.1.99 accelerate==0.23.0 protobuf tiktoken transformers-stream-generator einops \
+    && ln -s /usr/bin/python3.8/lib/python3.8/site-packages/cmake/data/bin/cmake /usr/bin/cmake
+
+# Install oneCCL
+RUN git clone https://github.com/oneapi-src/oneCCL.git /tmp/oneCCL
+WORKDIR /tmp/oneCCL
+RUN git checkout 2021.10 \
+    && sed -i 's/cpu_gpu_dpcpp/./g' cmake/templates/oneCCLConfig.cmake.in \
+    && mkdir build
+WORKDIR /tmp/oneCCL/build
+RUN cmake .. -DCMAKE_INSTALL_PREFIX=/usr/local/oneCCL \
+    && make -j install
+
+RUN echo "source /usr/local/oneCCL/env/setvars.sh" >> ~/.bashrc
+
+WORKDIR /root/
+RUN rm -rf /tmp/oneCCL
+
+RUN git clone https://github.com/intel/xFasterTransformer.git
+
+SHELL ["/bin/bash", "-c"]
+WORKDIR /root/xFasterTransformer
+RUN git checkout ${TAG} \
+    && export "LD_LIBRARY_PATH=/usr/local/mklml_lnx_2019.0.5.20190502/lib:$LD_LIBRARY_PATH" \
+    && export "PATH=/usr/bin/python3.8:$PATH" \
+    && echo "source /usr/local/oneCCL/env/setvars.sh" >> ~/.bash_profile \
+    && source ~/.bash_profile \
+    && python setup.py build \
+    && python setup.py egg_info bdist_wheel --verbose \
+    && pip install --no-cache-dir dist/*
+
+RUN mkdir -p /usr/local/xft/lib \
+    && cp /root/xFasterTransformer/build/libxfastertransformer.so /usr/local/xft/lib \
+    && cp /root/xFasterTransformer/build/libxft_comm_helper.so /usr/local/xft/lib \
+    && cp -r /root/xFasterTransformer/include /usr/local/xft/ \
+    && mkdir -p /usr/local/include/xft/ \
+    && ln -s /usr/local/xft/include /usr/local/include/xft/include
+
+RUN echo "export \$(python -c 'import xfastertransformer as xft; print(xft.get_env())')" >> ~/.bashrc
+
+COPY comps /root/comps
+
+RUN pip install --no-cache-dir --upgrade pip && \
+    pip install --no-cache-dir -r /root/comps/llms/text-generation/vllm-xft/requirements.txt
+
+ENV PYTHONPATH=$PYTHONPATH:/root
+
+RUN chmod +x /root/comps/llms/text-generation/vllm-xft/run.sh
+
+WORKDIR /root/comps/llms/text-generation/vllm-xft/
+
+ENTRYPOINT ["/root/comps/llms/text-generation/vllm-xft/run.sh"]
+
diff --git a/comps/llms/text-generation/vllm-xft/llm.py b/comps/llms/text-generation/vllm-xft/llm.py
new file mode 100644
index 000000000..02446baa6
--- /dev/null
+++ b/comps/llms/text-generation/vllm-xft/llm.py
@@ -0,0 +1,72 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+import os
+
+from fastapi.responses import StreamingResponse
+from langchain_community.llms import VLLMOpenAI
+from langsmith import traceable
+
+from comps import GeneratedDoc, LLMParamsDoc, ServiceType, opea_microservices, register_microservice
+
+
+@register_microservice(
+    name="opea_service@llm_vllm_xft",
+    service_type=ServiceType.LLM,
+    endpoint="/v1/chat/completions",
+    host="0.0.0.0",
+    port=9000,
+)
+@traceable(run_type="llm")
+def llm_generate(input: LLMParamsDoc):
+    """Generate text for ``input.query`` through the vLLM-xFT backend.
+
+    The backend base URL is read from the ``vLLM_LLM_ENDPOINT`` environment
+    variable and defaults to ``http://localhost:18688``.
+
+    Args:
+        input: Request parameters. This function reads ``query``,
+            ``max_new_tokens``, ``top_p``, ``temperature``,
+            ``repetition_penalty`` and ``streaming``.
+
+    Returns:
+        A ``StreamingResponse`` with media type ``text/event-stream`` when
+        ``input.streaming`` is true, otherwise a ``GeneratedDoc`` holding the
+        generated text and the original query.
+    """
+    llm_endpoint = os.getenv("vLLM_LLM_ENDPOINT", "http://localhost:18688")
+    llm = VLLMOpenAI(
+        openai_api_key="EMPTY",
+        openai_api_base=llm_endpoint + "/v1",
+        max_tokens=input.max_new_tokens,
+        model_name="xft",
+        top_p=input.top_p,
+        temperature=input.temperature,
+        # NOTE(review): repetition_penalty is forwarded as presence_penalty;
+        # confirm the two settings are meant to be interchangeable.
+        presence_penalty=input.repetition_penalty,
+        streaming=input.streaming,
+    )
+
+    if not input.streaming:
+        response = llm.invoke(input.query)
+        return GeneratedDoc(text=response, prompt=input.query)
+
+    def stream_generator():
+        # Each chunk is sent as the repr of its UTF-8 bytes on an SSE "data:" line.
+        chunks = []
+        for text in llm.stream(input.query):
+            chunks.append(text)
+            chunk_repr = repr(text.encode("utf-8"))
+            print(f"[llm - chat_stream] chunk:{chunk_repr}")
+            yield f"data: {chunk_repr}\n\n"
+        # Join once for the final log line instead of growing a string per chunk.
+        chat_response = "".join(chunks)
+        print(f"[llm - chat_stream] stream response: {chat_response}")
+        yield "data: [DONE]\n\n"
+
+    return StreamingResponse(stream_generator(), media_type="text/event-stream")
+
+
+if __name__ == "__main__":
+    opea_microservices["opea_service@llm_vllm_xft"].start()
diff --git a/comps/llms/text-generation/vllm-xft/requirements.txt b/comps/llms/text-generation/vllm-xft/requirements.txt
new file mode 100644
index 000000000..1035c67a0
--- /dev/null
+++ b/comps/llms/text-generation/vllm-xft/requirements.txt
@@ -0,0 +1,9 @@
+docarray[full]
+fastapi
+langchain==0.1.16
+langsmith
+opentelemetry-api
+opentelemetry-exporter-otlp
+opentelemetry-sdk
+shortuuid
+vllm-xft
diff --git a/comps/llms/text-generation/vllm-xft/run.sh b/comps/llms/text-generation/vllm-xft/run.sh
new file mode 100644
index 000000000..4baf95065
--- /dev/null
+++ b/comps/llms/text-generation/vllm-xft/run.sh
@@ -0,0 +1,30 @@
+#!/bin/sh
+
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+# Stop at the first failing step so a failed model conversion does not leave
+# the wrapper running without a usable backend.
+set -e
+
+# Convert the model in HF_DATASET_DIR to xFasterTransformer format and write
+# the result to OUTPUT_DIR.
+python -c 'import os; import xfastertransformer as xft; xft.Qwen2Convert().convert(os.environ["HF_DATASET_DIR"], os.environ["OUTPUT_DIR"])'
+
+# NOTE(review): presumably cleared so requests to the local backend are not
+# routed through the proxy -- confirm.
+unset http_proxy
+
+# Serve the converted model with vLLM in the background on localhost:18688.
+python -m vllm.entrypoints.openai.api_server \
+    --model "${OUTPUT_DIR}" \
+    --tokenizer "${TOKEN_PATH}" \
+    --dtype bf16 \
+    --kv-cache-dtype fp16 \
+    --served-model-name xft \
+    --host localhost \
+    --port 18688 \
+    --trust-remote-code &
+
+# Run the LLM microservice wrapper in the foreground.
+python llm.py