From 2c7d0a5b8b33d9a90ede19a0ee227393982ac340 Mon Sep 17 00:00:00 2001
From: Theresa Barton
Date: Wed, 2 Oct 2024 10:12:07 -0700
Subject: [PATCH] [Fix] Fix all the Huggingface paths (#1553)

---
 README.md                                          |  4 ++--
 benchmark/benchmark_vllm_060/README.md             | 16 ++++++++--------
 benchmark/blog_v0_2/README.md                      |  4 ++--
 docker/compose.yaml                                |  2 +-
 docker/k8s-sglang-service.yaml                     |  2 +-
 docs/en/benchmark_and_profiling.md                 |  2 +-
 docs/en/install.md                                 |  4 ++--
 .../runtime/openai_chat_with_response_prefill.py   |  4 ++--
 python/sglang/test/test_utils.py                   |  6 +++---
 test/srt/models/test_generation_models.py          |  2 +-
 test/srt/test_openai_server.py                     |  2 +-
 11 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/README.md b/README.md
index 2651d09432..65ac1350f8 100644
--- a/README.md
+++ b/README.md
@@ -81,7 +81,7 @@ docker run --gpus all \
     --env "HF_TOKEN=" \
     --ipc=host \
     lmsysorg/sglang:latest \
-    python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --host 0.0.0.0 --port 30000
+    python3 -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --host 0.0.0.0 --port 30000
 ```
 
 ### Method 4: Using docker compose
@@ -121,7 +121,7 @@ resources:
 run: |
   conda deactivate
   python3 -m sglang.launch_server \
-    --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \
+    --model-path meta-llama/Llama-3.1-8B-Instruct \
     --host 0.0.0.0 \
     --port 30000
 ```
diff --git a/benchmark/benchmark_vllm_060/README.md b/benchmark/benchmark_vllm_060/README.md
index 5a1247c5f4..b480dabf23 100644
--- a/benchmark/benchmark_vllm_060/README.md
+++ b/benchmark/benchmark_vllm_060/README.md
@@ -58,12 +58,12 @@ We referred to the reproduction method in https://github.com/vllm-project/vllm/i
 
 ```bash
 # Llama 3.1 8B Instruct on 1 x A100
-python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --enable-torch-compile --disable-radix-cache
-python -m vllm.entrypoints.openai.api_server --model meta-llama/Meta-Llama-3.1-8B-Instruct --disable-log-requests --num-scheduler-steps 10 --max_model_len 4096
+python -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --enable-torch-compile --disable-radix-cache
+python -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-3.1-8B-Instruct --disable-log-requests --num-scheduler-steps 10 --max_model_len 4096
 
 # Llama 3.1 70B Instruct on 4 x H100
-python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-70B-Instruct --disable-radix-cache --tp 4
-python -m vllm.entrypoints.openai.api_server --model meta-llama/Meta-Llama-3.1-70B-Instruct --disable-log-requests --num-scheduler-steps 10 --tensor 4 --max_model_len 4096
+python -m sglang.launch_server --model-path meta-llama/Llama-3.1-70B-Instruct --disable-radix-cache --tp 4
+python -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-3.1-70B-Instruct --disable-log-requests --num-scheduler-steps 10 --tensor 4 --max_model_len 4096
 
 # bench serving
 python3 -m sglang.bench_serving --backend sglang --dataset-name sharegpt --num-prompts 1200 --request-rate 4
@@ -76,12 +76,12 @@ python3 -m sglang.bench_serving --backend vllm --dataset-name sharegpt --num-pro
 
 ```bash
 # Llama 3.1 8B Instruct on 1 x A100
-python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --enable-torch-compile --disable-radix-cache
-python -m vllm.entrypoints.openai.api_server --model meta-llama/Meta-Llama-3.1-8B-Instruct --disable-log-requests --num-scheduler-steps 10 --max_model_len 4096
+python -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --enable-torch-compile --disable-radix-cache
+python -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-3.1-8B-Instruct --disable-log-requests --num-scheduler-steps 10 --max_model_len 4096
 
 # Llama 3.1 70B Instruct on 4 x H100
-python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-70B-Instruct --disable-radix-cache --tp 4 --mem-frac 0.88
-python -m vllm.entrypoints.openai.api_server --model meta-llama/Meta-Llama-3.1-70B-Instruct --disable-log-requests --num-scheduler-steps 10 --tensor 4 --max_model_len 4096
+python -m sglang.launch_server --model-path meta-llama/Llama-3.1-70B-Instruct --disable-radix-cache --tp 4 --mem-frac 0.88
+python -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-3.1-70B-Instruct --disable-log-requests --num-scheduler-steps 10 --tensor 4 --max_model_len 4096
 
 # bench serving
 python3 -m sglang.bench_serving --backend sglang --dataset-name sharegpt --num-prompts 5000
diff --git a/benchmark/blog_v0_2/README.md b/benchmark/blog_v0_2/README.md
index 57443e5fe2..7448554ee6 100644
--- a/benchmark/blog_v0_2/README.md
+++ b/benchmark/blog_v0_2/README.md
@@ -27,10 +27,10 @@ export HF_TOKEN=hf_token
 
 ```bash
 # Meta-Llama-3.1-8B-Instruct
-python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --enable-torch-compile --disable-radix-cache
+python -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --enable-torch-compile --disable-radix-cache
 
 # Meta-Llama-3.1-70B-Instruct
-python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-70B-Instruct --disable-radix-cache --tp 8
+python -m sglang.launch_server --model-path meta-llama/Llama-3.1-70B-Instruct --disable-radix-cache --tp 8
 
 # Meta-Llama-3-70B-Instruct-FP8
 python -m sglang.launch_server --model-path neuralmagic/Meta-Llama-3-70B-Instruct-FP8 --disable-radix-cache --tp 8
diff --git a/docker/compose.yaml b/docker/compose.yaml
index 1480146222..c49d5c5bba 100644
--- a/docker/compose.yaml
+++ b/docker/compose.yaml
@@ -17,7 +17,7 @@ services:
       # - SGLANG_USE_MODELSCOPE: true
     entrypoint: python3 -m sglang.launch_server
     command:
-      --model-path meta-llama/Meta-Llama-3.1-8B-Instruct
+      --model-path meta-llama/Llama-3.1-8B-Instruct
       --host 0.0.0.0
       --port 30000
     ulimits:
diff --git a/docker/k8s-sglang-service.yaml b/docker/k8s-sglang-service.yaml
index c217f356af..cbccb14217 100644
--- a/docker/k8s-sglang-service.yaml
+++ b/docker/k8s-sglang-service.yaml
@@ -32,7 +32,7 @@ spec:
       ports:
         - containerPort: 30000
       command: ["python3", "-m", "sglang.launch_server"]
-      args: ["--model-path", "meta-llama/Meta-Llama-3.1-8B-Instruct", "--host", "0.0.0.0", "--port", "30000"]
+      args: ["--model-path", "meta-llama/Llama-3.1-8B-Instruct", "--host", "0.0.0.0", "--port", "30000"]
       env:
         - name: HF_TOKEN
           value:
diff --git a/docs/en/benchmark_and_profiling.md b/docs/en/benchmark_and_profiling.md
index 3fbd935891..77fbbfc1b6 100644
--- a/docs/en/benchmark_and_profiling.md
+++ b/docs/en/benchmark_and_profiling.md
@@ -30,7 +30,7 @@ apt install nsight-systems-cli
 ```bash
 # server
 # set the delay and duration times according to needs
-nsys profile --trace-fork-before-exec=true --cuda-graph-trace=node -o sglang.out --delay 60 --duration 70 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --disable-radix-cache
+nsys profile --trace-fork-before-exec=true --cuda-graph-trace=node -o sglang.out --delay 60 --duration 70 python3 -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --disable-radix-cache
 
 # client
 python3 -m sglang.bench_serving --backend sglang --num-prompts 6000 --dataset-name random --random-input 4096 --random-output 2048
diff --git a/docs/en/install.md b/docs/en/install.md
index c9dc1d70ae..55eed71ae7 100644
--- a/docs/en/install.md
+++ b/docs/en/install.md
@@ -35,7 +35,7 @@ docker run --gpus all \
     --env "HF_TOKEN=" \
     --ipc=host \
     lmsysorg/sglang:latest \
-    python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --host 0.0.0.0 --port 30000
+    python3 -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --host 0.0.0.0 --port 30000
 ```
 
 ### Method 4: Using docker compose
@@ -75,7 +75,7 @@ resources:
   run: |
     conda deactivate
     python3 -m sglang.launch_server \
-      --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \
+      --model-path meta-llama/Llama-3.1-8B-Instruct \
       --host 0.0.0.0 \
       --port 30000
 ```
diff --git a/examples/runtime/openai_chat_with_response_prefill.py b/examples/runtime/openai_chat_with_response_prefill.py
index a856019b51..1b1604b302 100644
--- a/examples/runtime/openai_chat_with_response_prefill.py
+++ b/examples/runtime/openai_chat_with_response_prefill.py
@@ -1,6 +1,6 @@
 """
 Usage:
-python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --port 30000
+python -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --port 30000
 python openai_chat.py
 """
 
@@ -10,7 +10,7 @@
 client = openai.Client(base_url="http://127.0.0.1:30000/v1", api_key="EMPTY")
 
 response = client.chat.completions.create(
-    model="meta-llama/Meta-Llama-3.1-8B-Instruct",
+    model="meta-llama/Llama-3.1-8B-Instruct",
     messages=[
         {"role": "system", "content": "You are a helpful AI assistant"},
         {
diff --git a/python/sglang/test/test_utils.py b/python/sglang/test/test_utils.py
index fbe45bb2ff..2c22f8d901 100644
--- a/python/sglang/test/test_utils.py
+++ b/python/sglang/test/test_utils.py
@@ -23,13 +23,13 @@ from sglang.utils import get_exception_traceback
 
 DEFAULT_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/Meta-Llama-3.1-8B-FP8"
-DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Meta-Llama-3.1-8B-Instruct"
+DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.1-8B-Instruct"
 DEFAULT_MOE_MODEL_NAME_FOR_TEST = "mistralai/Mixtral-8x7B-Instruct-v0.1"
 DEFAULT_MLA_MODEL_NAME_FOR_TEST = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"
 DEFAULT_MLA_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8"
 DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH = 600
 
-DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1 = "meta-llama/Meta-Llama-3.1-8B-Instruct,mistralai/Mistral-7B-Instruct-v0.3,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct,google/gemma-2-27b-it"
-DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2 = "meta-llama/Meta-Llama-3.1-70B-Instruct,mistralai/Mixtral-8x7B-Instruct-v0.1,Qwen/Qwen2-57B-A14B-Instruct,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"
+DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1 = "meta-llama/Llama-3.1-8B-Instruct,mistralai/Mistral-7B-Instruct-v0.3,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct,google/gemma-2-27b-it"
+DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2 = "meta-llama/Llama-3.1-70B-Instruct,mistralai/Mixtral-8x7B-Instruct-v0.1,Qwen/Qwen2-57B-A14B-Instruct,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1 = "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8,neuralmagic/Mistral-7B-Instruct-v0.3-FP8,neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8,neuralmagic/gemma-2-2b-it-FP8"
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2 = "neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8,neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8,neuralmagic/Qwen2-72B-Instruct-FP8,neuralmagic/Qwen2-57B-A14B-Instruct-FP8,neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8"
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_QUANT_TP1 = "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4,hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4"
diff --git a/test/srt/models/test_generation_models.py b/test/srt/models/test_generation_models.py
index 21078e8aaa..7be410ccb0 100644
--- a/test/srt/models/test_generation_models.py
+++ b/test/srt/models/test_generation_models.py
@@ -44,7 +44,7 @@ class ModelCase:
 
 # Popular models that run on CI
 CI_MODELS = [
-    ModelCase("meta-llama/Meta-Llama-3.1-8B-Instruct"),
+    ModelCase("meta-llama/Llama-3.1-8B-Instruct"),
     ModelCase("google/gemma-2-2b"),
 ]
diff --git a/test/srt/test_openai_server.py b/test/srt/test_openai_server.py
index d92a9de96b..5afe9b0b17 100644
--- a/test/srt/test_openai_server.py
+++ b/test/srt/test_openai_server.py
@@ -499,7 +499,7 @@ def test_response_prefill(self):
         client = openai.Client(api_key=self.api_key, base_url=self.base_url)
 
         response = client.chat.completions.create(
-            model="meta-llama/Meta-Llama-3.1-8B-Instruct",
+            model="meta-llama/Llama-3.1-8B-Instruct",
             messages=[
                 {"role": "system", "content": "You are a helpful AI assistant"},
                 {