This repository has been archived by the owner on Oct 25, 2024. It is now read-only.

Commit

[NeuralChat] Support Neuralchat-TGI serving with Docker (#1208)
Signed-off-by: LetongHan <letong.han@intel.com>
Co-authored-by: VincyZhang <wenxin.zhang@intel.com>
letonghan and VincyZhang authored Jan 31, 2024
1 parent ac0ea1e commit 8ebff39
Showing 7 changed files with 198 additions and 4 deletions.
@@ -0,0 +1,97 @@
# Copyright (c) 2023 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
#
# THIS IS A GENERATED DOCKERFILE.
#
# This file was assembled from multiple pieces, whose use is documented
# throughout. Please refer to the TensorFlow dockerfiles documentation
# for more information.
#
# ============================================================================


## SPR environment
ARG UBUNTU_VER=22.04
FROM ubuntu:${UBUNTU_VER} as cpu

ARG ITREX_VER=main
ARG PYTHON_VERSION=3.10
ARG REPO=https://github.com/intel/intel-extension-for-transformers.git
ARG REPO_PATH=""
ARG SSHD_PORT=22
ENV SSHD_PORT ${SSHD_PORT}

# See http://bugs.python.org/issue19846
ENV LANG C.UTF-8

# Install system dependencies
SHELL ["/bin/bash", "--login", "-c"]
RUN apt update \
&& apt install -y build-essential wget numactl git openssh-server libgl1-mesa-glx libjemalloc2 google-perftools \
&& apt install -y python${PYTHON_VERSION} python3-pip \
&& pip install --upgrade pip setuptools wheel \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*

RUN ln -s /usr/bin/python3 /usr/bin/python

# install Docker Client and dependencies
RUN apt-get update && apt-get install -y \
apt-transport-https \
ca-certificates \
curl \
gnupg-agent \
software-properties-common

RUN curl -fsSL https://download.docker.com/linux/ubuntu/gpg | apt-key add -

RUN add-apt-repository \
"deb [arch=amd64] https://download.docker.com/linux/ubuntu \
$(lsb_release -cs) \
stable"

RUN apt-get update && apt-get install -y docker-ce-cli

# Download ITREX code
RUN mkdir -p /intel-extension-for-transformers
COPY ${REPO_PATH} /intel-extension-for-transformers
RUN if [ "$REPO_PATH" == "" ]; then rm -rf intel-extension-for-transformers/* && rm -rf intel-extension-for-transformers/.* ; git clone --single-branch --branch=${ITREX_VER} ${REPO} intel-extension-for-transformers ; fi
WORKDIR /intel-extension-for-transformers

RUN pip install oneccl_bind_pt --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/ && \
cd /intel-extension-for-transformers && pip install -r requirements.txt && \
pip install -v . && \
cd ./intel_extension_for_transformers/neural_chat/examples/finetuning/instruction && pip install -r requirements.txt && \
cd /intel-extension-for-transformers/intel_extension_for_transformers/neural_chat && pip install -r requirements_cpu.txt && \
pip install astunparse ninja pyyaml mkl mkl-include setuptools cmake cffi future six requests dataclasses && \
pip install typing_extensions datasets accelerate SentencePiece evaluate nltk rouge_score protobuf==3.20.1 tokenizers einops peft

# Enable passwordless ssh for mpirun
RUN mkdir /var/run/sshd
RUN passwd -d root
RUN sed -i'' -e's/^#PermitRootLogin prohibit-password$/PermitRootLogin yes/' /etc/ssh/sshd_config \
&& sed -i'' -e's/^#PasswordAuthentication yes$/PasswordAuthentication yes/' /etc/ssh/sshd_config \
&& sed -i'' -e's/^#PermitEmptyPasswords no$/PermitEmptyPasswords yes/' /etc/ssh/sshd_config \
&& sed -i'' -e's/^UsePAM yes/UsePAM no/' /etc/ssh/sshd_config \
&& echo "Port "$SSHD_PORT"" >> /etc/ssh/sshd_config \
&& echo "Host *" >> /etc/ssh/ssh_config \
&& echo " Port "$SSHD_PORT"" >> /etc/ssh/ssh_config \
&& echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config
EXPOSE ${SSHD_PORT}


ENTRYPOINT ["neuralchat_server"]
CMD ["start", "--config_file", "/tgi.yaml"]

@@ -0,0 +1,38 @@
Intel Neural Chat Inference Dockerfile installer for Ubuntu 22.04

# Start NeuralChat and TGI serving with Docker

## Environment Setup

### Setup Xeon SPR Environment
Use `Dockerfile_tgi` to build the Docker image in your environment.
```bash
docker build . -f Dockerfile_tgi -t neuralchat_tgi:latest
```
If you need to set proxy settings, add `--build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy` as shown below.
```bash
docker build . -f Dockerfile_tgi -t neuralchat_tgi:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy
```

### Start NeuralChat Service
Before starting the NeuralChat service, configure `tgi.yaml` according to your real environment.
Make sure the specified `port` is available and that `device` is set to `cpu` (`auto` will not work).
For other parameters, please refer to `intel_extension_for_transformers/neural_chat/examples/serving/TGI/README.md`.
The `docker run` command below mounts the host Docker socket so the container can use the host Docker daemon through the Docker CLI installed in the image.

```bash
docker run -it --net=host --ipc=host -v /var/run/docker.sock:/var/run/docker.sock -v ./tgi.yaml:/tgi.yaml neuralchat_tgi:latest
```


## Consume the Service
Once the `docker run` command has executed successfully, you can consume the HTTP services offered by NeuralChat.

Here is an example of consuming the TGI service; remember to substitute your real IP address and port.
```bash
curl ${your_ip}:${your_port}/v1/tgi/generate \
-X POST \
-d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' \
-H 'Content-Type: application/json'
```
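
If you prefer Python, here is a minimal sketch that sends the same request with the `requests` library. The URL below assumes the default `port: 8000` from `tgi.yaml` and is a placeholder; replace it with your real IP address and port.
```python
# Minimal sketch (not part of the repository): consume the NeuralChat TGI
# endpoint from Python. URL and generation parameters are placeholders.
import requests

url = "http://localhost:8000/v1/tgi/generate"  # replace with your real IP and port
payload = {
    "inputs": "What is Deep Learning?",
    "parameters": {"max_new_tokens": 17, "do_sample": True},
}

response = requests.post(url, json=payload, timeout=60)
response.raise_for_status()
print(response.json())
```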


@@ -0,0 +1,33 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (c) 2023 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This is the parameter configuration file for NeuralChat Serving.

#################################################################################
# SERVER SETTING #
#################################################################################
host: 0.0.0.0
port: 8000

model_name_or_path: "Intel/neural-chat-7b-v3-1"
device: "cpu"

serving:
framework: "tgi"


tasks_list: ['textchat', 'tgi']
@@ -63,7 +63,6 @@ You can customize the configuration file 'tgi.yaml' to match your environment se
| model_name_or_path | "./neural-chat-7b-v3-1" |
| device | "cpu"/"gpu"/"hpu" |
| serving.framework | "tgi" |
| serving.framework.tgi_engine_params.model_id | "mistralai/Mistral-7B-Instruct-v0.1" |
| serving.framework.tgi_engine_params.sharded | true (false only on cpu) |
| serving.framework.tgi_engine_params.num_shard | 4 (not effective when sharded is false) |
| serving.framework.tgi_engine_params.habana_visible_devices | "0,1" (only on hpu) |
@@ -76,3 +75,18 @@ To start the NeuralChat server with TGI framework, run the following command:
```shell
nohup bash run.sh &
```


# Consume the Services
After the services are successfully launched, you can consume the HTTP services offered by NeuralChat.

Here is an example of consuming the TGI service; remember to substitute your real IP address and port.

```bash
curl ${your_ip}:${your_port}/v1/tgi/generate \
-X POST \
-d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' \
-H 'Content-Type: application/json'
```

You can also consume the service via Postman, plain HTTP requests, or other clients.
@@ -29,7 +29,6 @@ device: "auto"
serving:
framework: "tgi"
tgi_engine_params:
model_id: "mistralai/Mistral-7B-Instruct-v0.1"
# not supported on CPU
sharded: true
num_shard: 4
@@ -132,7 +132,6 @@ def init(self, config):
# TGI serving
elif serving_framework == "tgi":
tgi_params = serving.get("tgi_engine_params", None)
tgi_model_id = tgi_params.get('model_id', "mistralai/Mistral-7B-Instruct-v0.1")
tgi_sharded = tgi_params.get('sharded', False)
tgi_num_shard = tgi_params.get('num_shard', 1)
tgi_habana_visible_devices = tgi_params.get('habana_visible_devices', "all")
@@ -167,7 +166,7 @@ def init(self, config):
else:
logger.error(f"Supported device: [cpu, gpu, hpu]. Your device: {device}")
raise Exception("Please specify device for tgi.")
tgi_cmd += f" --model-id {tgi_model_id}"
tgi_cmd += f" --model-id {model_name_or_path}"
if tgi_sharded and tgi_num_shard > 1:
tgi_cmd += " --sharded {tgi_sharded} --num-shard {tgi_num_shard}"
# start tgi service
@@ -29,6 +29,20 @@ class TextGenerationAPIRouter(APIRouter):
def __init__(self) -> None:
super().__init__()
self.endpoint = "http://0.0.0.0:9876/"
self.chatbot = None

def set_chatbot(self, chatbot, use_deepspeed, world_size, host, port) -> None:
self.chatbot = chatbot
self.use_deepspeed = use_deepspeed
self.world_size = world_size
self.host = host
self.port = port

def get_chatbot(self):
if self.chatbot is None:
logger.error("Chatbot instance is not found.")
raise RuntimeError("Chatbot instance has not been set.")
return self.chatbot

def handle_tgi_request(self, prompt, parameters, stream=False):
client = InferenceClient(model=self.endpoint)
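
The rest of `handle_tgi_request` is truncated in this diff. As a rough, hypothetical sketch (not the repository's actual code), a handler like this would typically forward the prompt and TGI parameters to the backing TGI endpoint through `huggingface_hub`'s `InferenceClient.text_generation`:
```python
# Hypothetical sketch only -- the actual method body is truncated in this diff.
# It forwards a prompt plus TGI generation parameters to the TGI endpoint.
from huggingface_hub import InferenceClient

def handle_tgi_request(prompt, parameters, endpoint="http://0.0.0.0:9876/", stream=False):
    client = InferenceClient(model=endpoint)
    parameters = parameters or {}
    return client.text_generation(
        prompt,
        max_new_tokens=parameters.get("max_new_tokens", 64),
        do_sample=parameters.get("do_sample", False),
        stream=stream,
    )
```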