From 0c2841d71690d824cba9b311be0417dc53ba7d34 Mon Sep 17 00:00:00 2001
From: Ce Gao
Date: Thu, 25 May 2023 11:42:00 +0800
Subject: [PATCH] chore: Add chatglm 6b (#32)

Signed-off-by: Ce Gao
---
 .github/workflows/docker-publish.yml |  2 +
 .github/workflows/gcr.yml            |  2 +
 README.md                            | 15 +++---
 images/chatglm-6b/Dockerfile         | 75 ++++++++++++++++++++++++++++
 main.py                              | 19 +++++--
 5 files changed, 100 insertions(+), 13 deletions(-)
 create mode 100644 images/chatglm-6b/Dockerfile

diff --git a/.github/workflows/docker-publish.yml b/.github/workflows/docker-publish.yml
index b687586..18d98ca 100644
--- a/.github/workflows/docker-publish.yml
+++ b/.github/workflows/docker-publish.yml
@@ -25,6 +25,8 @@ jobs:
         image:
           - name: modelzai/llm-chatglm-6b-int4
             dockerfile: ./images/base/Dockerfile
+          - name: modelzai/llm-chatglm-6b
+            dockerfile: ./images/chatglm-6b/Dockerfile
           - name: modelzai/llm-llama-7b
             dockerfile: ./images/llama-7b/Dockerfile
           # - name: modelzai/llm-fastchat-t5-3b
diff --git a/.github/workflows/gcr.yml b/.github/workflows/gcr.yml
index d73d57e..c9f483c 100644
--- a/.github/workflows/gcr.yml
+++ b/.github/workflows/gcr.yml
@@ -25,6 +25,8 @@ jobs:
         image:
           - name: modelzai/llm-chatglm-6b-int4
             dockerfile: ./images/base/Dockerfile
+          - name: modelzai/llm-chatglm-6b
+            dockerfile: ./images/chatglm-6b/Dockerfile
           - name: modelzai/llm-llama-7b
             dockerfile: ./images/llama-7b/Dockerfile
           # - name: modelzai/llm-fastchat-t5-3b
diff --git a/README.md b/README.md
index 66ebd81..f53a7c4 100644
--- a/README.md
+++ b/README.md
@@ -33,17 +33,16 @@ python main.py
 
 Currently, we support the following models:
 
-| Model Name | Model (`MODELZ_MODEL`) | Tokenizer (`MODELZ_TOKENIZER`) |
-| ---------- | ------------ | ---------------- |
-| Vicuna 7B Delta V1.1 | `lmsys/vicuna-7b-delta-v1.1` | `lmsys/vicuna-7b-delta-v1.1` |
-| LLaMA 7B | `decapoda-research/llama-7b-hf` | `decapoda-research/llama-7b-hf` |
-| ChatGLM 6B INT4 | `THUDM/chatglm-6b-int4` | `THUDM/chatglm-6b-int4` |
-| ChatGLM 6B | `THUDM/chatglm-6b` | `THUDM/chatglm-6b` |
+| Model Name | Model (`MODELZ_MODEL`) | Docker Image |
+| ---------- | ----------- | ---------------- |
+| Vicuna 7B Delta V1.1 | `lmsys/vicuna-7b-delta-v1.1` | [modelzai/llm-vicuna-7b](https://hub.docker.com/repository/docker/modelzai/llm-vicuna-7b/general) |
+| LLaMA 7B | `decapoda-research/llama-7b-hf` | [modelzai/llm-llama-7b](https://hub.docker.com/repository/docker/modelzai/llm-llama-7b/general) |
+| ChatGLM 6B INT4 | `THUDM/chatglm-6b-int4` | [modelzai/llm-chatglm-6b-int4](https://hub.docker.com/repository/docker/modelzai/llm-chatglm-6b-int4/general) |
+| ChatGLM 6B | `THUDM/chatglm-6b` | [modelzai/llm-chatglm-6b](https://hub.docker.com/repository/docker/modelzai/llm-chatglm-6b/general) |
 
-
-You could set the `MODELZ_MODEL` and `MODELZ_TOKENIZER` environment variables to specify the model and tokenizer.
+You can set the `MODELZ_MODEL` environment variable to specify the model; the tokenizer defaults to the same name.
 
 ### Use OpenAI python SDK
diff --git a/images/chatglm-6b/Dockerfile b/images/chatglm-6b/Dockerfile
new file mode 100644
index 0000000..572d55b
--- /dev/null
+++ b/images/chatglm-6b/Dockerfile
@@ -0,0 +1,75 @@
+ARG base=nvidia/cuda:11.6.2-cudnn8-runtime-ubuntu20.04
+
+FROM ${base}
+
+ENV DEBIAN_FRONTEND=noninteractive LANG=en_US.UTF-8 LC_ALL=en_US.UTF-8
+ENV PATH /opt/conda/bin:$PATH
+
+ENV MODELZ_MODEL=THUDM/chatglm-6b
+ENV MODELZ_TOKENIZER=THUDM/chatglm-6b
+
+ARG MOSEC_PORT=8080
+ENV MOSEC_PORT=${MOSEC_PORT}
+
+ARG CONDA_VERSION=py310_22.11.1-1
+
+RUN apt update && \
+    apt install -y --no-install-recommends \
+    wget \
+    build-essential \
+    ca-certificates && \
+    rm -rf /var/lib/apt/lists/*
+
+RUN set -x && \
+    UNAME_M="$(uname -m)" && \
+    if [ "${UNAME_M}" = "x86_64" ]; then \
+        MINICONDA_URL="https://repo.anaconda.com/miniconda/Miniconda3-${CONDA_VERSION}-Linux-x86_64.sh"; \
+        SHA256SUM="00938c3534750a0e4069499baf8f4e6dc1c2e471c86a59caa0dd03f4a9269db6"; \
+    elif [ "${UNAME_M}" = "s390x" ]; then \
+        MINICONDA_URL="https://repo.anaconda.com/miniconda/Miniconda3-${CONDA_VERSION}-Linux-s390x.sh"; \
+        SHA256SUM="a150511e7fd19d07b770f278fb5dd2df4bc24a8f55f06d6274774f209a36c766"; \
+    elif [ "${UNAME_M}" = "aarch64" ]; then \
+        MINICONDA_URL="https://repo.anaconda.com/miniconda/Miniconda3-${CONDA_VERSION}-Linux-aarch64.sh"; \
+        SHA256SUM="48a96df9ff56f7421b6dd7f9f71d548023847ba918c3826059918c08326c2017"; \
+    elif [ "${UNAME_M}" = "ppc64le" ]; then \
+        MINICONDA_URL="https://repo.anaconda.com/miniconda/Miniconda3-${CONDA_VERSION}-Linux-ppc64le.sh"; \
+        SHA256SUM="4c86c3383bb27b44f7059336c3a46c34922df42824577b93eadecefbf7423836"; \
+    fi && \
+    wget "${MINICONDA_URL}" -O miniconda.sh -q && \
+    echo "${SHA256SUM} miniconda.sh" > shasum && \
+    if [ "${CONDA_VERSION}" != "latest" ]; then sha256sum --check --status shasum; fi && \
+    mkdir -p /opt && \
+    bash miniconda.sh -b -p /opt/conda && \
+    rm miniconda.sh shasum && \
+    ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh && \
+    echo ". /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc && \
+    echo "conda activate base" >> ~/.bashrc && \
+    find /opt/conda/ -follow -type f -name '*.a' -delete && \
+    find /opt/conda/ -follow -type f -name '*.js.map' -delete && \
+    /opt/conda/bin/conda clean -afy
+
+RUN conda create -n envd python=3.9
+
+ENV ENVD_PREFIX=/opt/conda/envs/envd/bin
+
+RUN update-alternatives --install /usr/bin/python python ${ENVD_PREFIX}/python 1 && \
+    update-alternatives --install /usr/bin/python3 python3 ${ENVD_PREFIX}/python3 1 && \
+    update-alternatives --install /usr/bin/pip pip ${ENVD_PREFIX}/pip 1 && \
+    update-alternatives --install /usr/bin/pip3 pip3 ${ENVD_PREFIX}/pip3 1
+
+COPY requirements.txt /
+
+RUN pip install -r requirements.txt
+
+RUN mkdir -p /workspace
+
+COPY main.py workspace/
+
+WORKDIR /workspace
+
+# RUN python main.py --dry-run
+
+# # disable huggingface update check (could be very slow)
+# ENV HF_HUB_OFFLINE=true
+
+ENTRYPOINT [ "uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8080", "--workers", "1" ]
diff --git a/main.py b/main.py
index 504135c..059d6d7 100644
--- a/main.py
+++ b/main.py
@@ -38,11 +38,17 @@ class LLM:
     def __init__(self, model_name: str, tokenizer_name: str) -> None:
+        # Use the same tokenizer as the model
+        if tokenizer_name is None:
+            tokenizer_name = model_name
+
         self.tokenizer = transformers.AutoTokenizer.from_pretrained(
             tokenizer_name, trust_remote_code=True
         )
-        model_cls = getattr(transformers, LanguageModels.transformer_cls(model_name))
-        self.model = model_cls.from_pretrained(model_name, trust_remote_code=True)
+        model_cls = getattr(
+            transformers, LanguageModels.transformer_cls(model_name))
+        self.model = model_cls.from_pretrained(
+            model_name, trust_remote_code=True)
         self.device = (
             torch.cuda.current_device() if torch.cuda.is_available() else "cpu"
         )
@@ -90,7 +96,8 @@ async def on_post(self, req: Request, resp: Response):
             logger.info(f"Failed to parse request: {err}")
             # return 400 otherwise the client will retry
             resp.status = falcon.HTTP_400
-            resp.data = ErrorResponse.from_validation_err(err, str(buf)).to_json()
+            resp.data = ErrorResponse.from_validation_err(
+                err, str(buf)).to_json()
             return
 
         tokens = llm.encode(chat_req.get_prompt(self.model_name))
@@ -104,7 +111,8 @@ async def on_post(self, req: Request, resp: Response):
             model=self.model_name,
             created=datetime.now(),
             choices=[
-                ChatChoice(message=ChatMessage(content=msg, role=Role.ASSISTANT)),
+                ChatChoice(message=ChatMessage(
+                    content=msg, role=Role.ASSISTANT)),
             ],
             usage=TokenUsage(
                 prompt_tokens=input_length,
@@ -127,7 +135,8 @@ async def on_post(self, req: Request, resp: Response):
             logger.info(f"Failed to parse request: {err}")
             # return 400 otherwise the client will retry
             resp.status = falcon.HTTP_400
-            resp.data = ErrorResponse.from_validation_err(err, str(buf)).to_json()
+            resp.data = ErrorResponse.from_validation_err(
+                err, str(buf)).to_json()
             return
 
         tokens = llm.encode(prompt_req.get_prompt())
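
Below is a minimal usage sketch (not part of the patch) for the new `modelzai/llm-chatglm-6b` image, following the README's "Use OpenAI python SDK" section. It assumes the container has been started with something like `docker run --gpus all -p 8080:8080 modelzai/llm-chatglm-6b` (port 8080 comes from the Dockerfile's ENTRYPOINT), that the server exposes OpenAI-compatible chat routes at the root path, and that the pre-1.0 `openai` SDK is installed; the `api_base` value and the placeholder API key are illustrative assumptions, not confirmed by the patch.

```python
# Illustrative sketch: assumes a local modelzai/llm-chatglm-6b container listening on
# port 8080 and the pre-1.0 openai SDK (openai.ChatCompletion.create style API).
import openai

openai.api_base = "http://localhost:8080"  # assumed base URL; adjust if the server mounts /v1
openai.api_key = "any-string"              # assumed: the local server does not validate keys

resp = openai.ChatCompletion.create(
    model="THUDM/chatglm-6b",  # matches the MODELZ_MODEL baked into the image
    messages=[{"role": "user", "content": "Say hello in one sentence."}],
)
print(resp["choices"][0]["message"]["content"])
```

Because the image bakes `MODELZ_MODEL`/`MODELZ_TOKENIZER` in as defaults and `main.py` now falls back to the model name when no tokenizer is given, no extra environment variables should be needed for this model; overriding `MODELZ_MODEL` at `docker run` time would switch to another supported model from the README table.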