From 8ebff395fa670dd328b4e7760529c274f400baf3 Mon Sep 17 00:00:00 2001
From: Letong Han <106566639+letonghan@users.noreply.github.com>
Date: Wed, 31 Jan 2024 10:03:51 +0800
Subject: [PATCH] [NeuralChat] Support Neuralchat-TGI serving with Docker (#1208)

Signed-off-by: LetongHan
Co-authored-by: VincyZhang
---
 .../docker/tgi_serving/Dockerfile_tgi         | 97 +++++++++++++++++++
 .../neural_chat/docker/tgi_serving/README.md  | 38 ++++++++
 .../neural_chat/docker/tgi_serving/tgi.yaml   | 33 +++++++
 .../examples/serving/TGI/README.md            | 16 ++-
 .../neural_chat/examples/serving/TGI/tgi.yaml |  1 -
 .../neural_chat/server/neuralchat_server.py   |  3 +-
 .../neural_chat/server/restful/tgi_api.py     | 14 +++
 7 files changed, 198 insertions(+), 4 deletions(-)
 create mode 100644 intel_extension_for_transformers/neural_chat/docker/tgi_serving/Dockerfile_tgi
 create mode 100644 intel_extension_for_transformers/neural_chat/docker/tgi_serving/README.md
 create mode 100644 intel_extension_for_transformers/neural_chat/docker/tgi_serving/tgi.yaml

diff --git a/intel_extension_for_transformers/neural_chat/docker/tgi_serving/Dockerfile_tgi b/intel_extension_for_transformers/neural_chat/docker/tgi_serving/Dockerfile_tgi
new file mode 100644
index 00000000000..f67278779d7
--- /dev/null
+++ b/intel_extension_for_transformers/neural_chat/docker/tgi_serving/Dockerfile_tgi
@@ -0,0 +1,97 @@
+# Copyright (c) 2023 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+#
+# THIS IS A GENERATED DOCKERFILE.
+#
+# This file was assembled from multiple pieces, whose use is documented
+# throughout. Please refer to the TensorFlow dockerfiles documentation
+# for more information.
+#
+# ============================================================================
+
+
+## SPR environment
+ARG UBUNTU_VER=22.04
+FROM ubuntu:${UBUNTU_VER} as cpu
+
+ARG ITREX_VER=main
+ARG PYTHON_VERSION=3.10
+ARG REPO=https://github.com/intel/intel-extension-for-transformers.git
+ARG REPO_PATH=""
+ARG SSHD_PORT=22
+ENV SSHD_PORT ${SSHD_PORT}
+
+# See http://bugs.python.org/issue19846
+ENV LANG C.UTF-8
+
+# Install system dependencies
+SHELL ["/bin/bash", "--login", "-c"]
+RUN apt update \
+    && apt install -y build-essential wget numactl git openssh-server libgl1-mesa-glx libjemalloc2 google-perftools \
+    && apt install -y python${PYTHON_VERSION} python3-pip \
+    && pip install --upgrade pip setuptools wheel \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
+
+RUN ln -s /usr/bin/python3 /usr/bin/python
+
+# install Docker Client and dependencies
+RUN apt-get update && apt-get install -y \
+    apt-transport-https \
+    ca-certificates \
+    curl \
+    gnupg-agent \
+    software-properties-common
+
+RUN curl -fsSL https://download.docker.com/linux/ubuntu/gpg | apt-key add -
+
+RUN add-apt-repository \
+    "deb [arch=amd64] https://download.docker.com/linux/ubuntu \
+    $(lsb_release -cs) \
+    stable"
+
+RUN apt-get update && apt-get install -y docker-ce-cli
+
+# Download ITREX code
+RUN mkdir -p /intel-extension-for-transformers
+COPY ${REPO_PATH} /intel-extension-for-transformers
+RUN if [ "$REPO_PATH" == "" ]; then rm -rf intel-extension-for-transformers/* && rm -rf intel-extension-for-transformers/.* ; git clone --single-branch --branch=${ITREX_VER} ${REPO} intel-extension-for-transformers ; fi
+WORKDIR /intel-extension-for-transformers
+
+RUN pip install oneccl_bind_pt --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/ && \
+    cd /intel-extension-for-transformers && pip install -r requirements.txt && \
+    pip install -v . && \
+    cd ./intel_extension_for_transformers/neural_chat/examples/finetuning/instruction && pip install -r requirements.txt && \
+    cd /intel-extension-for-transformers/intel_extension_for_transformers/neural_chat && pip install -r requirements_cpu.txt && \
+    pip install astunparse ninja pyyaml mkl mkl-include setuptools cmake cffi future six requests dataclasses && \
+    pip install typing_extensions datasets accelerate SentencePiece evaluate nltk rouge_score protobuf==3.20.1 tokenizers einops peft
+
+# Enable passwordless ssh for mpirun
+RUN mkdir /var/run/sshd
+RUN passwd -d root
+RUN sed -i'' -e's/^#PermitRootLogin prohibit-password$/PermitRootLogin yes/' /etc/ssh/sshd_config \
+    && sed -i'' -e's/^#PasswordAuthentication yes$/PasswordAuthentication yes/' /etc/ssh/sshd_config \
+    && sed -i'' -e's/^#PermitEmptyPasswords no$/PermitEmptyPasswords yes/' /etc/ssh/sshd_config \
+    && sed -i'' -e's/^UsePAM yes/UsePAM no/' /etc/ssh/sshd_config \
+    && echo "Port "$SSHD_PORT"" >> /etc/ssh/sshd_config \
+    && echo "Host *" >> /etc/ssh/ssh_config \
+    && echo "  Port "$SSHD_PORT"" >> /etc/ssh/ssh_config \
+    && echo "  StrictHostKeyChecking no" >> /etc/ssh/ssh_config
+EXPOSE ${SSHD_PORT}
+
+
+ENTRYPOINT ["neuralchat_server"]
+CMD ["start", "--config_file", "/tgi.yaml"]
+
diff --git a/intel_extension_for_transformers/neural_chat/docker/tgi_serving/README.md b/intel_extension_for_transformers/neural_chat/docker/tgi_serving/README.md
new file mode 100644
index 00000000000..331534d2eac
--- /dev/null
+++ b/intel_extension_for_transformers/neural_chat/docker/tgi_serving/README.md
@@ -0,0 +1,38 @@
+Intel Neural Chat Inference Dockerfile installer for Ubuntu 22.04
+
+# Start NeuralChat and TGI serving with Docker
+
+## Environment Setup
+
+### Setup Xeon SPR Environment
+Use `Dockerfile_tgi` to build the Docker image in your environment.
+```bash
+docker build . -f Dockerfile_tgi -t neuralchat_tgi:latest
+```
+If you need to set proxy settings, add `--build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy`, as shown below.
+```bash
+docker build . -f Dockerfile_tgi -t neuralchat_tgi:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy
+```
+
+### Start NeuralChat Service
+Before starting NeuralChat services, you need to configure `tgi.yaml` according to your real environment.
+Make sure the specified `port` is available and `device` is set to `cpu` (`auto` will not work).
+For other detailed parameters, please refer to `intel_extension_for_transformers/neural_chat/examples/serving/TGI/README.md`.
+
+```bash
+docker run -it --net=host --ipc=host -v /var/run/docker.sock:/var/run/docker.sock -v ./tgi.yaml:/tgi.yaml neuralchat_tgi:latest
+```
+
+
+## Consume the Service
+Once the `docker run` command has executed successfully, you can consume the HTTP services offered by NeuralChat.
+
+Here is an example of consuming the TGI service; remember to substitute your real IP and port.
+```bash
+curl ${your_ip}:${your_port}/v1/tgi/generate \
+    -X POST \
+    -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' \
+    -H 'Content-Type: application/json'
+```
+
+
diff --git a/intel_extension_for_transformers/neural_chat/docker/tgi_serving/tgi.yaml b/intel_extension_for_transformers/neural_chat/docker/tgi_serving/tgi.yaml
new file mode 100644
index 00000000000..dbe70a3ce2b
--- /dev/null
+++ b/intel_extension_for_transformers/neural_chat/docker/tgi_serving/tgi.yaml
@@ -0,0 +1,33 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Copyright (c) 2023 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This is the parameter configuration file for NeuralChat Serving.
+
+#################################################################################
+#                               SERVER SETTING                                  #
+#################################################################################
+host: 0.0.0.0
+port: 8000
+
+model_name_or_path: "Intel/neural-chat-7b-v3-1"
+device: "cpu"
+
+serving:
+  framework: "tgi"
+
+
+tasks_list: ['textchat', 'tgi']
diff --git a/intel_extension_for_transformers/neural_chat/examples/serving/TGI/README.md b/intel_extension_for_transformers/neural_chat/examples/serving/TGI/README.md
index 6276f3d3588..74b4ecd80ac 100644
--- a/intel_extension_for_transformers/neural_chat/examples/serving/TGI/README.md
+++ b/intel_extension_for_transformers/neural_chat/examples/serving/TGI/README.md
@@ -63,7 +63,6 @@ You can customize the configuration file 'tgi.yaml' to match your environment se
 | model_name_or_path | "./neural-chat-7b-v3-1" |
 | device | "cpu"/"gpu"/"hpu" |
 | serving.framework | "tgi" |
-| serving.framework.tgi_engine_params.model_id | "mistralai/Mistral-7B-Instruct-v0.1" |
 | serving.framework.tgi_engine_params.sharded | true (false only on cpu) |
 | serving.framework.tgi_engine_params.num_shard | 4 (not effective when sharded is false) |
 | serving.framework.tgi_engine_params.habana_visible_devices | "0,1" (only on hpu) |
@@ -76,3 +75,18 @@ To start the NeuralChat server with TGI framework, run the following command:
 ```shell
 nohup bash run.sh &
 ```
+
+
+# Consume the Services
+After the services are successfully launched, you can consume the HTTP services offered by NeuralChat.
+
+Here is an example of consuming the TGI service; remember to substitute your real IP and port.
+
+```bash
+curl ${your_ip}:${your_port}/v1/tgi/generate \
+    -X POST \
+    -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' \
+    -H 'Content-Type: application/json'
+```
+
+Of course, you can also consume the service via Postman, an HTTP request library, or other clients.
\ No newline at end of file
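For reference beyond the curl examples above, the same `/v1/tgi/generate` endpoint can also be exercised from Python. The snippet below is a minimal sketch and not part of the patch: it assumes the service is reachable at the host and port configured in the `tgi.yaml` shown above (0.0.0.0:8000); substitute your real IP and port.

```python
# Minimal sketch (not part of this patch): call the NeuralChat TGI endpoint
# from Python, mirroring the curl example above.
import requests

# Host and port are taken from the tgi.yaml above; adjust to your environment.
url = "http://0.0.0.0:8000/v1/tgi/generate"
payload = {
    "inputs": "What is Deep Learning?",
    "parameters": {"max_new_tokens": 17, "do_sample": True},
}

response = requests.post(url, json=payload, timeout=60)
response.raise_for_status()
# Print the raw response body returned by the service.
print(response.text)
```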
diff --git a/intel_extension_for_transformers/neural_chat/examples/serving/TGI/tgi.yaml b/intel_extension_for_transformers/neural_chat/examples/serving/TGI/tgi.yaml
index 9906f97f881..890a2f42cb0 100644
--- a/intel_extension_for_transformers/neural_chat/examples/serving/TGI/tgi.yaml
+++ b/intel_extension_for_transformers/neural_chat/examples/serving/TGI/tgi.yaml
@@ -29,7 +29,6 @@ device: "auto"
 serving:
   framework: "tgi"
   tgi_engine_params:
-    model_id: "mistralai/Mistral-7B-Instruct-v0.1"
     # not supported on CPU
     sharded: true
     num_shard: 4
diff --git a/intel_extension_for_transformers/neural_chat/server/neuralchat_server.py b/intel_extension_for_transformers/neural_chat/server/neuralchat_server.py
index 941f88b3a84..8c46ebea3aa 100644
--- a/intel_extension_for_transformers/neural_chat/server/neuralchat_server.py
+++ b/intel_extension_for_transformers/neural_chat/server/neuralchat_server.py
@@ -132,7 +132,6 @@ def init(self, config):
             # TGI serving
             elif serving_framework == "tgi":
                 tgi_params = serving.get("tgi_engine_params", None)
-                tgi_model_id = tgi_params.get('model_id', "mistralai/Mistral-7B-Instruct-v0.1")
                 tgi_sharded = tgi_params.get('sharded', False)
                 tgi_num_shard = tgi_params.get('num_shard', 1)
                 tgi_habana_visible_devices = tgi_params.get('habana_visible_devices', "all")
@@ -167,7 +166,7 @@ def init(self, config):
                 else:
                     logger.error(f"Supported device: [cpu, gpu, hpu]. Your device: {device}")
                     raise Exception("Please specify device for tgi.")
-                tgi_cmd += f" --model-id {tgi_model_id}"
+                tgi_cmd += f" --model-id {model_name_or_path}"
                 if tgi_sharded and tgi_num_shard > 1:
                     tgi_cmd += " --sharded {tgi_sharded} --num-shard {tgi_num_shard}"
                 # start tgi service
diff --git a/intel_extension_for_transformers/neural_chat/server/restful/tgi_api.py b/intel_extension_for_transformers/neural_chat/server/restful/tgi_api.py
index be7517d6e59..134862f8299 100644
--- a/intel_extension_for_transformers/neural_chat/server/restful/tgi_api.py
+++ b/intel_extension_for_transformers/neural_chat/server/restful/tgi_api.py
@@ -29,6 +29,20 @@ class TextGenerationAPIRouter(APIRouter):
     def __init__(self) -> None:
         super().__init__()
         self.endpoint = "http://0.0.0.0:9876/"
+        self.chatbot = None
+
+    def set_chatbot(self, chatbot, use_deepspeed, world_size, host, port) -> None:
+        self.chatbot = chatbot
+        self.use_deepspeed = use_deepspeed
+        self.world_size = world_size
+        self.host = host
+        self.port = port
+
+    def get_chatbot(self):
+        if self.chatbot is None:
+            logger.error("Chatbot instance is not found.")
+            raise RuntimeError("Chatbot instance has not been set.")
+        return self.chatbot
 
     def handle_tgi_request(self, prompt, parameters, stream=False):
         client = InferenceClient(model=self.endpoint)
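The patch excerpt above is cut off inside `handle_tgi_request`, so the remainder of the handler is not shown. Purely as an illustrative sketch (not code from this PR), the `huggingface_hub` `InferenceClient` that the handler instantiates can also be pointed directly at the internal TGI endpoint set in `__init__` (`http://0.0.0.0:9876/`), assuming the TGI container launched by the server is running:

```python
# Illustrative sketch (not part of this patch): query the TGI backend directly
# with the same InferenceClient the router's handler creates.
from huggingface_hub import InferenceClient

# Endpoint matches self.endpoint in TextGenerationAPIRouter.__init__.
client = InferenceClient(model="http://0.0.0.0:9876/")

# Non-streaming text generation with the same parameters as the curl example.
text = client.text_generation(
    "What is Deep Learning?",
    max_new_tokens=17,
    do_sample=True,
)
print(text)
```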