diff --git a/.github/workflows/nightly-eval.yml b/.github/workflows/nightly-eval.yml
index 809120c0c18..7b77c63a54c 100644
--- a/.github/workflows/nightly-eval.yml
+++ b/.github/workflows/nightly-eval.yml
@@ -27,14 +27,14 @@ jobs:
           bash scripts/ci_install_dependency.sh
           pip install --upgrade "evalplus[vllm] @ git+https://github.com/evalplus/evalplus"
 
-      - name: Test human eval
+      - name: Test gsm8k
         timeout-minutes: 120
         run: |
           cd test/srt
-          python3 test_nightly_human_eval.py
+          python3 test_nightly_gsm8k_eval.py
 
-      - name: Test gsm8k
+      - name: Test human eval
         timeout-minutes: 120
         run: |
           cd test/srt
-          python3 test_nightly_gsm8k_eval.py
+          python3 test_nightly_human_eval.py
diff --git a/python/sglang/test/test_utils.py b/python/sglang/test/test_utils.py
index 5aba58d7a5a..d673d59ff54 100644
--- a/python/sglang/test/test_utils.py
+++ b/python/sglang/test/test_utils.py
@@ -439,18 +439,22 @@ def popen_launch_server(
         process = subprocess.Popen(command, stdout=None, stderr=None, env=env)
 
     start_time = time.time()
-    while time.time() - start_time < timeout:
-        try:
-            headers = {
-                "Content-Type": "application/json; charset=utf-8",
-                "Authorization": f"Bearer {api_key}",
-            }
-            response = requests.get(f"{base_url}/health_generate", headers=headers)
-            if response.status_code == 200:
-                return process
-        except requests.RequestException:
-            pass
-        time.sleep(10)
+    with requests.Session() as session:
+        while time.time() - start_time < timeout:
+            try:
+                headers = {
+                    "Content-Type": "application/json; charset=utf-8",
+                    "Authorization": f"Bearer {api_key}",
+                }
+                response = session.get(
+                    f"{base_url}/health_generate",
+                    headers=headers,
+                )
+                if response.status_code == 200:
+                    return process
+            except requests.RequestException:
+                pass
+            time.sleep(10)
     raise TimeoutError("Server failed to start within the timeout period.")
 
 
diff --git a/test/srt/test_nightly_gsm8k_eval.py b/test/srt/test_nightly_gsm8k_eval.py
index ede25b1d4b2..7c208e84b93 100644
--- a/test/srt/test_nightly_gsm8k_eval.py
+++ b/test/srt/test_nightly_gsm8k_eval.py
@@ -1,6 +1,8 @@
 import json
 import os
+import subprocess
 import unittest
+import warnings
 from datetime import datetime
 from types import SimpleNamespace
 
@@ -18,23 +20,23 @@
 )
 
 MODEL_SCORE_THRESHOLDS = {
-    "meta-llama/Llama-3.1-8B-Instruct": 0.8316,
-    "mistralai/Mistral-7B-Instruct-v0.3": 0.5861,
-    "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct": 0.8672,
-    "google/gemma-2-27b-it": 0.9227,
-    "meta-llama/Llama-3.1-70B-Instruct": 0.9623,
-    "mistralai/Mixtral-8x7B-Instruct-v0.1": 0.6415,
-    "Qwen/Qwen2-57B-A14B-Instruct": 0.8791,
-    "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8": 0.8672,
-    "neuralmagic/Mistral-7B-Instruct-v0.3-FP8": 0.5544,
-    "neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8": 0.8356,
-    "neuralmagic/gemma-2-2b-it-FP8": 0.6059,
-    "neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8": 0.9504,
-    "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8": 0.6138,
-    "neuralmagic/Qwen2-72B-Instruct-FP8": 0.9504,
-    "neuralmagic/Qwen2-57B-A14B-Instruct-FP8": 0.8197,
-    "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4": 0.8395,
-    "hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4": 0.8435,
+    "meta-llama/Llama-3.1-8B-Instruct": 0.83,
+    "mistralai/Mistral-7B-Instruct-v0.3": 0.58,
+    "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct": 0.84,
+    "google/gemma-2-27b-it": 0.92,
+    "meta-llama/Llama-3.1-70B-Instruct": 0.96,
+    "mistralai/Mixtral-8x7B-Instruct-v0.1": 0.64,
+    "Qwen/Qwen2-57B-A14B-Instruct": 0.87,
+    "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8": 0.84,
+    "neuralmagic/Mistral-7B-Instruct-v0.3-FP8": 0.54,
+    "neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8": 0.83,
+    "neuralmagic/gemma-2-2b-it-FP8": 0.60,
+    "neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8": 0.95,
+    "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8": 0.61,
+    "neuralmagic/Qwen2-72B-Instruct-FP8": 0.95,
+    "neuralmagic/Qwen2-57B-A14B-Instruct-FP8": 0.82,
+    "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4": 0.84,
+    "hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4": 0.84,
 }
 
 
@@ -65,6 +67,7 @@ def launch_server(base_url, model, is_fp8, is_tp2):
         base_url,
         timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
         other_args=other_args,
+        return_stdout_stderr=(subprocess.DEVNULL, subprocess.DEVNULL),
     )
     return process
 
@@ -132,6 +135,9 @@ def tearDown(self):
             kill_child_process(self.process.pid, include_self=True)
 
     def test_mgsm_en_all_models(self):
+        warnings.filterwarnings(
+            "ignore", category=ResourceWarning, message="unclosed.*socket"
+        )
         is_first = True
         all_results = []