Dockerfile.bundled_model
# This Dockerfile downloads the model at build time instead of at runtime. It
# is included primarily for demonstration purposes.
# Note: Due to image size constraints, this can *ONLY* be used for very small
#       models. Building with a large model can take a very long time while
#       the container layers are compressed and exported, and if the resulting
#       image is too large (>10GB) the deployment will fail when it attempts
#       to pull the image from the registry.
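#
# One way to build this image (illustrative only; the model name, image tag,
# and token path below are assumptions, not part of this repo):
#
#   DOCKER_BUILDKIT=1 docker build -f Dockerfile.bundled_model \
#     --secret id=hf_token,src=$HOME/.cache/huggingface/token \
#     --build-arg MODEL_NAME=facebook/opt-125m \
#     -t vllm-bundled-model .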
FROM vllm/vllm-openai:latest
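# MODEL_NAME has no default and must be supplied at build time via
# --build-arg; REVISION selects which Hugging Face revision to download.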
ARG MODEL_NAME=
ARG REVISION="main"
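# Cache Hugging Face downloads inside the image, enable the faster hf_transfer
# download backend, and persist MODEL_NAME as a runtime environment variable
# so the ENTRYPOINT below can reference it.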
ENV HF_HOME=/workspace/model-cache
ENV HF_HUB_ENABLE_HF_TRANSFER=1
ENV MODEL_NAME=$MODEL_NAME
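# Bring in the download helper script and the Hugging Face CLI (with
# hf_transfer support) needed to fetch the model during the build.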
COPY ./model_downloader.sh /workspace/model_downloader.sh
RUN pip install --no-cache-dir "huggingface_hub[cli,hf_transfer]"
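# The HF token is exposed only for this step through a BuildKit secret, so it
# is never written into an image layer. model_downloader.sh (kept alongside
# this Dockerfile, not shown here) is assumed to fetch $MODEL_NAME at
# $REVISION into the HF_HOME cache, roughly equivalent to:
#   huggingface-cli download "$MODEL_NAME" --revision "$REVISION"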
RUN --mount=type=secret,id=hf_token \
HF_TOKEN=$(cat /run/secrets/hf_token) bash /workspace/model_downloader.sh
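# Serve only from the files bundled above; never contact the Hub at runtime.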
ENV HF_HUB_OFFLINE=1
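# GPU memory utilization and max model length can be tuned at runtime through
# the GPU_MEMORY_LIMIT and MAX_MODEL_LEN environment variables, for example
# (image tag and port mapping are illustrative assumptions; 8000 is vLLM's
# default API port):
#   docker run --gpus all -p 8000:8000 \
#     -e GPU_MEMORY_LIMIT=0.90 -e MAX_MODEL_LEN=4096 vllm-bundled-model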
ENTRYPOINT python3 -m vllm.entrypoints.openai.api_server \
--model $MODEL_NAME \
--gpu-memory-utilization ${GPU_MEMORY_LIMIT:-0.95} \
${MAX_MODEL_LEN:+--max-model-len "$MAX_MODEL_LEN"}