Dockerfile.bundled_model
# This Dockerfile downloads the model at build time instead of at runtime. It
# is included primarily for demonstration purposes.
# Note: Due to image size constraints, this can *ONLY* be used for very small
#       models. Building with a large model can take a very long time while
#       the container layers are compressed and exported, and if the resulting
#       image is too large (>10GB) the deployment will fail when it attempts
#       to pull the image from the registry.
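#
# One way to build this image (illustrative only; the model name, image tag,
# and token path below are assumptions, not part of this repo):
#
#   DOCKER_BUILDKIT=1 docker build -f Dockerfile.bundled_model \
#     --secret id=hf_token,src=$HOME/.cache/huggingface/token \
#     --build-arg MODEL_NAME=facebook/opt-125m \
#     -t vllm-bundled-model .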
FROM vllm/vllm-openai:latest
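# MODEL_NAME has no default and must be supplied at build time via
# --build-arg; REVISION selects which Hugging Face revision to download.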
ARG MODEL_NAME=
ARG REVISION="main"
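# Cache Hugging Face downloads inside the image, enable the faster hf_transfer
# download backend, and persist MODEL_NAME as a runtime environment variable
# so the ENTRYPOINT below can reference it.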
ENV HF_HOME=/workspace/model-cache
ENV HF_HUB_ENABLE_HF_TRANSFER=1
ENV MODEL_NAME=$MODEL_NAME
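# Bring in the download helper script and the Hugging Face CLI (with
# hf_transfer support) needed to fetch the model during the build.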
COPY ./model_downloader.sh /workspace/model_downloader.sh
RUN pip install --no-cache-dir "huggingface_hub[cli,hf_transfer]"
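# The HF token is exposed only for this step through a BuildKit secret, so it
# is never written into an image layer. model_downloader.sh (kept alongside
# this Dockerfile, not shown here) is assumed to fetch $MODEL_NAME at
# $REVISION into the HF_HOME cache, roughly equivalent to:
#   huggingface-cli download "$MODEL_NAME" --revision "$REVISION"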
RUN --mount=type=secret,id=hf_token \
HF_TOKEN=$(cat /run/secrets/hf_token) bash /workspace/model_downloader.sh
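# Serve only from the files bundled above; never contact the Hub at runtime.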
ENV HF_HUB_OFFLINE=1
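# GPU memory utilization and max model length can be tuned at runtime through
# the GPU_MEMORY_LIMIT and MAX_MODEL_LEN environment variables, for example
# (image tag and port mapping are illustrative assumptions; 8000 is vLLM's
# default API port):
#   docker run --gpus all -p 8000:8000 \
#     -e GPU_MEMORY_LIMIT=0.90 -e MAX_MODEL_LEN=4096 vllm-bundled-model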
ENTRYPOINT python3 -m vllm.entrypoints.openai.api_server \
--model $MODEL_NAME \
--gpu-memory-utilization ${GPU_MEMORY_LIMIT:-0.95} \
${MAX_MODEL_LEN:+--max-model-len "$MAX_MODEL_LEN"}