From 8ebff395fa670dd328b4e7760529c274f400baf3 Mon Sep 17 00:00:00 2001
From: Letong Han <106566639+letonghan@users.noreply.github.com>
Date: Wed, 31 Jan 2024 10:03:51 +0800
Subject: [PATCH] [NeuralChat] Support Neuralchat-TGI serving with Docker (#1208)

Signed-off-by: LetongHan
Co-authored-by: VincyZhang
---
 .../docker/tgi_serving/Dockerfile_tgi         | 97 +++++++++++++++++++
 .../neural_chat/docker/tgi_serving/README.md  | 38 ++++++++
 .../neural_chat/docker/tgi_serving/tgi.yaml   | 33 +++++++
 .../examples/serving/TGI/README.md            | 16 ++-
 .../neural_chat/examples/serving/TGI/tgi.yaml |  1 -
 .../neural_chat/server/neuralchat_server.py   |  3 +-
 .../neural_chat/server/restful/tgi_api.py     | 14 +++
 7 files changed, 198 insertions(+), 4 deletions(-)
 create mode 100644 intel_extension_for_transformers/neural_chat/docker/tgi_serving/Dockerfile_tgi
 create mode 100644 intel_extension_for_transformers/neural_chat/docker/tgi_serving/README.md
 create mode 100644 intel_extension_for_transformers/neural_chat/docker/tgi_serving/tgi.yaml

diff --git a/intel_extension_for_transformers/neural_chat/docker/tgi_serving/Dockerfile_tgi b/intel_extension_for_transformers/neural_chat/docker/tgi_serving/Dockerfile_tgi
new file mode 100644
index 00000000000..f67278779d7
--- /dev/null
+++ b/intel_extension_for_transformers/neural_chat/docker/tgi_serving/Dockerfile_tgi
@@ -0,0 +1,97 @@
+# Copyright (c) 2023 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+#
+# THIS IS A GENERATED DOCKERFILE.
+#
+# This file was assembled from multiple pieces, whose use is documented
+# throughout. Please refer to the TensorFlow dockerfiles documentation
+# for more information.
+#
+# ============================================================================
+
+
+## SPR environment
+ARG UBUNTU_VER=22.04
+FROM ubuntu:${UBUNTU_VER} as cpu
+
+ARG ITREX_VER=main
+ARG PYTHON_VERSION=3.10
+ARG REPO=https://github.com/intel/intel-extension-for-transformers.git
+ARG REPO_PATH=""
+ARG SSHD_PORT=22
+ENV SSHD_PORT ${SSHD_PORT}
+
+# See http://bugs.python.org/issue19846
+ENV LANG C.UTF-8
+
+# Install system dependencies
+SHELL ["/bin/bash", "--login", "-c"]
+RUN apt update \
+    && apt install -y build-essential wget numactl git openssh-server libgl1-mesa-glx libjemalloc2 google-perftools \
+    && apt install -y python${PYTHON_VERSION} python3-pip \
+    && pip install --upgrade pip setuptools wheel \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
+
+RUN ln -s /usr/bin/python3 /usr/bin/python
+
+# install Docker Client and dependencies
+RUN apt-get update && apt-get install -y \
+    apt-transport-https \
+    ca-certificates \
+    curl \
+    gnupg-agent \
+    software-properties-common
+
+RUN curl -fsSL https://download.docker.com/linux/ubuntu/gpg | apt-key add -
+
+RUN add-apt-repository \
+    "deb [arch=amd64] https://download.docker.com/linux/ubuntu \
+    $(lsb_release -cs) \
+    stable"
+
+RUN apt-get update && apt-get install -y docker-ce-cli
+
+# Download ITREX code
+RUN mkdir -p /intel-extension-for-transformers
+COPY ${REPO_PATH} /intel-extension-for-transformers
+RUN if [ "$REPO_PATH" == "" ]; then rm -rf intel-extension-for-transformers/* && rm -rf intel-extension-for-transformers/.* ; git clone --single-branch --branch=${ITREX_VER} ${REPO} intel-extension-for-transformers ; fi
+WORKDIR /intel-extension-for-transformers
+
+RUN pip install oneccl_bind_pt --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/ && \
+    cd /intel-extension-for-transformers && pip install -r requirements.txt && \
+    pip install -v . && \
+    cd ./intel_extension_for_transformers/neural_chat/examples/finetuning/instruction && pip install -r requirements.txt && \
+    cd /intel-extension-for-transformers/intel_extension_for_transformers/neural_chat && pip install -r requirements_cpu.txt && \
+    pip install astunparse ninja pyyaml mkl mkl-include setuptools cmake cffi future six requests dataclasses && \
+    pip install typing_extensions datasets accelerate SentencePiece evaluate nltk rouge_score protobuf==3.20.1 tokenizers einops peft
+
+# Enable passwordless ssh for mpirun
+RUN mkdir /var/run/sshd
+RUN passwd -d root
+RUN sed -i'' -e's/^#PermitRootLogin prohibit-password$/PermitRootLogin yes/' /etc/ssh/sshd_config \
+    && sed -i'' -e's/^#PasswordAuthentication yes$/PasswordAuthentication yes/' /etc/ssh/sshd_config \
+    && sed -i'' -e's/^#PermitEmptyPasswords no$/PermitEmptyPasswords yes/' /etc/ssh/sshd_config \
+    && sed -i'' -e's/^UsePAM yes/UsePAM no/' /etc/ssh/sshd_config \
+    && echo "Port "$SSHD_PORT"" >> /etc/ssh/sshd_config \
+    && echo "Host *" >> /etc/ssh/ssh_config \
+    && echo "  Port "$SSHD_PORT"" >> /etc/ssh/ssh_config \
+    && echo "  StrictHostKeyChecking no" >> /etc/ssh/ssh_config
+EXPOSE ${SSHD_PORT}
+
+
+ENTRYPOINT ["neuralchat_server"]
+CMD ["start", "--config_file", "/tgi.yaml"]
+
diff --git a/intel_extension_for_transformers/neural_chat/docker/tgi_serving/README.md b/intel_extension_for_transformers/neural_chat/docker/tgi_serving/README.md
new file mode 100644
index 00000000000..331534d2eac
--- /dev/null
+++ b/intel_extension_for_transformers/neural_chat/docker/tgi_serving/README.md
@@ -0,0 +1,38 @@
+Intel Neural Chat Inference Dockerfile installer for Ubuntu 22.04
+
+# Start NeuralChat and TGI serving with Docker
+
+## Environment Setup
+
+### Setup Xeon SPR Environment
+Use `Dockerfile_tgi` to build the Docker image in your environment.
+```bash
+docker build . -f Dockerfile_tgi -t neuralchat_tgi:latest
+```
+If you need to set proxy settings, add `--build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy`, as shown below.
+```bash
+docker build . -f Dockerfile_tgi -t neuralchat_tgi:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy
+```
+
+### Start NeuralChat Service
+Before starting NeuralChat services, you need to configure `tgi.yaml` according to your real environment.
+Make sure the specified `port` is available and `device` is set to `cpu` (`auto` will not work).
+For other detailed parameters, please refer to `intel_extension_for_transformers/neural_chat/examples/serving/TGI/README.md`.
+
+```bash
+docker run -it --net=host --ipc=host -v /var/run/docker.sock:/var/run/docker.sock -v ./tgi.yaml:/tgi.yaml neuralchat_tgi:latest
+```
+
+
+## Consume the Service
+Once the `docker run` command has executed successfully, you can consume the HTTP services offered by NeuralChat.
+
+Here is an example of consuming the TGI service; remember to substitute your real IP and port.
+```bash
+curl ${your_ip}:${your_port}/v1/tgi/generate \
+    -X POST \
+    -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' \
+    -H 'Content-Type: application/json'
+```
+
+
diff --git a/intel_extension_for_transformers/neural_chat/docker/tgi_serving/tgi.yaml b/intel_extension_for_transformers/neural_chat/docker/tgi_serving/tgi.yaml
new file mode 100644
index 00000000000..dbe70a3ce2b
--- /dev/null
+++ b/intel_extension_for_transformers/neural_chat/docker/tgi_serving/tgi.yaml
@@ -0,0 +1,33 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Copyright (c) 2023 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This is the parameter configuration file for NeuralChat Serving.
+
+#################################################################################
+#                               SERVER SETTING                                  #
+#################################################################################
+host: 0.0.0.0
+port: 8000
+
+model_name_or_path: "Intel/neural-chat-7b-v3-1"
+device: "cpu"
+
+serving:
+  framework: "tgi"
+
+
+tasks_list: ['textchat', 'tgi']
diff --git a/intel_extension_for_transformers/neural_chat/examples/serving/TGI/README.md b/intel_extension_for_transformers/neural_chat/examples/serving/TGI/README.md
index 6276f3d3588..74b4ecd80ac 100644
--- a/intel_extension_for_transformers/neural_chat/examples/serving/TGI/README.md
+++ b/intel_extension_for_transformers/neural_chat/examples/serving/TGI/README.md
@@ -63,7 +63,6 @@ You can customize the configuration file 'tgi.yaml' to match your environment se
 | model_name_or_path | "./neural-chat-7b-v3-1" |
 | device | "cpu"/"gpu"/"hpu" |
 | serving.framework | "tgi" |
-| serving.framework.tgi_engine_params.model_id | "mistralai/Mistral-7B-Instruct-v0.1" |
 | serving.framework.tgi_engine_params.sharded | true (false only on cpu) |
 | serving.framework.tgi_engine_params.num_shard | 4 (not effective when sharded is false) |
 | serving.framework.tgi_engine_params.habana_visible_devices | "0,1" (only on hpu) |
@@ -76,3 +75,18 @@ To start the NeuralChat server with TGI framework, run the following command:
 ```shell
 nohup bash run.sh &
 ```
+
+
+# Consume the Services
+After the services are successfully launched, you can consume the HTTP services offered by NeuralChat.
+
+Here is an example of consuming the TGI service; remember to substitute your real IP and port.
+
+```bash
+curl ${your_ip}:${your_port}/v1/tgi/generate \
+    -X POST \
+    -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' \
+    -H 'Content-Type: application/json'
+```
+
+Of course, you can also consume the service via Postman, an HTTP request library, or other clients.
\ No newline at end of file
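For reference beyond the curl examples above, the same `/v1/tgi/generate` endpoint can also be exercised from Python. The snippet below is a minimal sketch and not part of the patch: it assumes the service is reachable at the host and port configured in the `tgi.yaml` shown above (0.0.0.0:8000); substitute your real IP and port.

```python
# Minimal sketch (not part of this patch): call the NeuralChat TGI endpoint
# from Python, mirroring the curl example above.
import requests

# Host and port are taken from the tgi.yaml above; adjust to your environment.
url = "http://0.0.0.0:8000/v1/tgi/generate"
payload = {
    "inputs": "What is Deep Learning?",
    "parameters": {"max_new_tokens": 17, "do_sample": True},
}

response = requests.post(url, json=payload, timeout=60)
response.raise_for_status()
# Print the raw response body returned by the service.
print(response.text)
```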
diff --git a/intel_extension_for_transformers/neural_chat/examples/serving/TGI/tgi.yaml b/intel_extension_for_transformers/neural_chat/examples/serving/TGI/tgi.yaml
index 9906f97f881..890a2f42cb0 100644
--- a/intel_extension_for_transformers/neural_chat/examples/serving/TGI/tgi.yaml
+++ b/intel_extension_for_transformers/neural_chat/examples/serving/TGI/tgi.yaml
@@ -29,7 +29,6 @@ device: "auto"
 serving:
   framework: "tgi"
   tgi_engine_params:
-    model_id: "mistralai/Mistral-7B-Instruct-v0.1"
     # not supported on CPU
     sharded: true
     num_shard: 4
diff --git a/intel_extension_for_transformers/neural_chat/server/neuralchat_server.py b/intel_extension_for_transformers/neural_chat/server/neuralchat_server.py
index 941f88b3a84..8c46ebea3aa 100644
--- a/intel_extension_for_transformers/neural_chat/server/neuralchat_server.py
+++ b/intel_extension_for_transformers/neural_chat/server/neuralchat_server.py
@@ -132,7 +132,6 @@ def init(self, config):
             # TGI serving
             elif serving_framework == "tgi":
                 tgi_params = serving.get("tgi_engine_params", None)
-                tgi_model_id = tgi_params.get('model_id', "mistralai/Mistral-7B-Instruct-v0.1")
                 tgi_sharded = tgi_params.get('sharded', False)
                 tgi_num_shard = tgi_params.get('num_shard', 1)
                 tgi_habana_visible_devices = tgi_params.get('habana_visible_devices', "all")
@@ -167,7 +166,7 @@ def init(self, config):
                 else:
                     logger.error(f"Supported device: [cpu, gpu, hpu]. Your device: {device}")
                     raise Exception("Please specify device for tgi.")
-                tgi_cmd += f" --model-id {tgi_model_id}"
+                tgi_cmd += f" --model-id {model_name_or_path}"
                 if tgi_sharded and tgi_num_shard > 1:
                     tgi_cmd += " --sharded {tgi_sharded} --num-shard {tgi_num_shard}"
                 # start tgi service
diff --git a/intel_extension_for_transformers/neural_chat/server/restful/tgi_api.py b/intel_extension_for_transformers/neural_chat/server/restful/tgi_api.py
index be7517d6e59..134862f8299 100644
--- a/intel_extension_for_transformers/neural_chat/server/restful/tgi_api.py
+++ b/intel_extension_for_transformers/neural_chat/server/restful/tgi_api.py
@@ -29,6 +29,20 @@ class TextGenerationAPIRouter(APIRouter):
     def __init__(self) -> None:
         super().__init__()
         self.endpoint = "http://0.0.0.0:9876/"
+        self.chatbot = None
+
+    def set_chatbot(self, chatbot, use_deepspeed, world_size, host, port) -> None:
+        self.chatbot = chatbot
+        self.use_deepspeed = use_deepspeed
+        self.world_size = world_size
+        self.host = host
+        self.port = port
+
+    def get_chatbot(self):
+        if self.chatbot is None:
+            logger.error("Chatbot instance is not found.")
+            raise RuntimeError("Chatbot instance has not been set.")
+        return self.chatbot
 
     def handle_tgi_request(self, prompt, parameters, stream=False):
         client = InferenceClient(model=self.endpoint)
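The patch excerpt above is cut off inside `handle_tgi_request`, so the remainder of the handler is not shown. Purely as an illustrative sketch (not code from this PR), the `huggingface_hub` `InferenceClient` that the handler instantiates can also be pointed directly at the internal TGI endpoint set in `__init__` (`http://0.0.0.0:9876/`), assuming the TGI container launched by the server is running:

```python
# Illustrative sketch (not part of this patch): query the TGI backend directly
# with the same InferenceClient the router's handler creates.
from huggingface_hub import InferenceClient

# Endpoint matches self.endpoint in TextGenerationAPIRouter.__init__.
client = InferenceClient(model="http://0.0.0.0:9876/")

# Non-streaming text generation with the same parameters as the curl example.
text = client.text_generation(
    "What is Deep Learning?",
    max_new_tokens=17,
    do_sample=True,
)
print(text)
```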