👷 Add integration test to the CI #140

Merged
merged 3 commits on Jan 8, 2025
Changes from all commits
42 changes: 42 additions & 0 deletions .github/workflows/test-pytorch-xla-tpu-tgi-integration.yml
@@ -0,0 +1,42 @@
name: Optimum TPU / Test TGI on TPU / Integration Tests

on:
  schedule:
    - cron: '0 4 * * *'  # run at 4 AM UTC
  # This can be used to allow manually triggering nightlies from the web interface
  workflow_dispatch:

concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
  cancel-in-progress: true

jobs:
  integration-tests:
    name: Run TGI Integration Tests
    runs-on:
      group: gcp-ct5lp-hightpu-8t

    env:
      PJRT_DEVICE: TPU
      HF_HUB_CACHE: /mnt/hf_cache/cache_huggingface
      HF_TOKEN: ${{ secrets.HF_TOKEN_OPTIMUM_TPU_CI }}
      TPU_ENV: ${{ vars.V5_LITEPOD_8_ENV }}

Collaborator:

This does not have a container section; does that mean it runs on the VM directly?

Collaborator (author):

Yes, indeed: if you run in the container directly, you get networking issues. So we use the VM directly, which then spawns the container.

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Install Python
        run: |
          sudo apt-get update -y
          sudo apt-get install -y python3 python3-pip
          sudo ln -s /usr/bin/python3 /usr/bin/python

      # To build the Docker image in the CI, we need to use the host network option
      - name: Build TGI Docker Image
        run: |
          make tpu-tgi NETWORK=host

      - name: Run integration tests
        run: |
          make tgi_docker_test
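Since the workflow declares workflow_dispatch in addition to the nightly cron, a run can also be triggered on demand. Below is a hedged sketch of doing so from Python via the GitHub REST API; the repository slug is an assumption inferred from the image names in the Makefile, and GITHUB_TOKEN is assumed to hold a token allowed to dispatch workflows.

import os

import requests

# POST /repos/{owner}/{repo}/actions/workflows/{workflow_file}/dispatches
# starts a workflow_dispatch run of the named workflow file.
resp = requests.post(
    "https://api.github.com/repos/huggingface/optimum-tpu/actions/workflows/"  # repo slug assumed
    "test-pytorch-xla-tpu-tgi-integration.yml/dispatches",
    headers={
        "Accept": "application/vnd.github+json",
        "Authorization": f"Bearer {os.environ['GITHUB_TOKEN']}",  # assumed token variable
    },
    json={"ref": "main"},  # branch whose copy of the workflow should run
)
resp.raise_for_status()  # GitHub responds 204 No Content on success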
5 changes: 4 additions & 1 deletion Makefile
@@ -42,11 +42,14 @@ clean:
rm -rf dist deps
make -C text-generation-inference/server/ clean

# normal usage: make tpu-tgi
# CI usage: make tpu-tgi NETWORK=host, to build the Docker image with the host network option
tpu-tgi:
docker build --rm -f text-generation-inference/docker/Dockerfile \
--build-arg VERSION=$(VERSION) \
--build-arg TGI_VERSION=$(TGI_VERSION) \
--ulimit nofile=100000:100000 \
$(if $(NETWORK),--network $(NETWORK),) \
Collaborator:

Why do we need the host network at this step?

Collaborator (author):

This is needed for the CI. If you build with the default network, some requests during the build process are blocked. For example, the protobuf installation fails.

-t huggingface/optimum-tpu:$(VERSION)-tgi .
docker tag huggingface/optimum-tpu:$(VERSION)-tgi huggingface/optimum-tpu:latest

@@ -111,6 +114,6 @@ tgi_test: test_installs tgi_server
-exec python -m pip install --force-reinstall {} \;
python -m pytest -sv text-generation-inference/tests -m torch_xla

tgi_docker_test: tpu-tgi
tgi_docker_test:
python -m pip install -r text-generation-inference/integration-tests/requirements.txt
python -m pytest -sv text-generation-inference/integration-tests
50 changes: 47 additions & 3 deletions text-generation-inference/integration-tests/conftest.py
@@ -23,13 +23,43 @@
DOCKER_IMAGE = os.getenv("DOCKER_IMAGE", "huggingface/optimum-tpu:latest")
HF_TOKEN = os.getenv("HF_TOKEN", None)
DOCKER_VOLUME = os.getenv("DOCKER_VOLUME", "/data")
TPU_ENV = os.getenv("TPU_ENV")

logger.add(
    sys.stderr,
    format="<green>{time:YYYY-MM-DD HH:mm:ss}</green> | <level>{level: <8}</level> | <cyan>{name}</cyan>:<cyan>{function}</cyan>:<cyan>{line}</cyan> - <level>{message}</level>",
    level="INFO",
)

def validate_ci_tpu_env_format(env_string: str) -> bool:
    """
    Validate that the CI TPU environment string follows the '--env ARGUMENT' pattern.
    Returns True if valid, False otherwise.
    """
    parts = env_string.split()
    return len(parts) % 2 == 0 and all(
        parts[i] == "--env" and not parts[i + 1].startswith("--env")
        for i in range(0, len(parts), 2)
    )
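# Editor's sketch (hedged): how the pairing rule above behaves on a few
# illustrative strings (variable names are not taken from the real CI config):
#   validate_ci_tpu_env_format("--env HF_TOKEN --env PJRT_DEVICE")  # -> True
#   validate_ci_tpu_env_format("--env HF_TOKEN PJRT_DEVICE")        # -> False, odd token count
#   validate_ci_tpu_env_format("--env --env")                       # -> False, flag in the value slot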

def process_ci_tpu_env_vars(env_string: str) -> dict:
    """
    Process the CI TPU environment string and return a dictionary of environment variables.
    """
    env_vars = {}
    # Extract the variable names from the string
    tpu_vars = [x.strip() for x in env_string.split('--env') if x.strip()]

    # Look up each variable in the host environment
    for var in tpu_vars:
        env_value = os.environ.get(var, "")
        env_vars[var] = env_value
        # Log if the environment variable is not set
        if not env_value:
            logger.warning(f"TPU environment variable {var} is not set")

    return env_vars
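# Editor's sketch (hedged): assuming the runner exported TPU_WORKER_ID=0 but not
# TPU_CHIPS_PER_HOST (both names illustrative), the helper above would return
#   process_ci_tpu_env_vars("--env TPU_WORKER_ID --env TPU_CHIPS_PER_HOST")
#   # -> {"TPU_WORKER_ID": "0", "TPU_CHIPS_PER_HOST": ""}, with a warning for the empty one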


def cleanup_handler(signum, frame):
logger.info("\nCleaning up containers due to shutdown, please wait...")
@@ -85,7 +115,8 @@ async def health(self, timeout: int = 60):
if attempt == timeout - 1:
logger.error(f"Health check failed after {timeout}s: {str(e)}")
raise RuntimeError(f"Health check failed: {str(e)}")
logger.debug(f"Connection attempt {attempt+1}/{timeout} failed: {str(e)}")
if attempt % 10 == 0: # Only log every 10th attempt
logger.debug(f"Connection attempt {attempt+1}/{timeout} failed: {str(e)}")
time.sleep(1)
except Exception as e:
logger.error(f"Unexpected error during health check: {str(e)}")
@@ -168,17 +199,30 @@ def docker_launcher(

env = {
"LOG_LEVEL": "info,text_generation_router,text_generation_launcher=debug",
"HF_HUB_ENABLE_HF_TRANSFER": "0"
"HF_HUB_ENABLE_HF_TRANSFER": "0",
"PJRT_DEVICE": "TPU"
}
env.update(MODEL_CONFIGS[model_name]["env_config"].copy())


# Add model_id to env
env["MODEL_ID"] = model_id

if HF_TOKEN is not None:
env["HF_TOKEN"] = HF_TOKEN

# Add TPU environment variables when running in CI
if TPU_ENV:
logger.info(f"TPU_ENV is set, adding specific TPU environment variables for the CI")
logger.debug(f"TPU_ENV: {TPU_ENV}")
# Validate TPU environment format
if not validate_ci_tpu_env_format(TPU_ENV):
raise ValueError(f"Invalid TPU environment format: {TPU_ENV}")

# Process TPU environment variables
tpu_env_vars = process_ci_tpu_env_vars(TPU_ENV)
env.update(tpu_env_vars)
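# Editor's note (hedged): the net effect is that every host variable named in
# TPU_ENV, e.g. "--env TPU_WORKER_ID" (hypothetical), is copied into the
# container environment alongside MODEL_ID, HF_TOKEN and PJRT_DEVICE.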


for var in ["MAX_BATCH_SIZE", "HF_SEQUENCE_LENGTH"]:
if var in os.environ:
env[var] = os.environ[var]
Expand Down
1 change: 1 addition & 0 deletions text-generation-inference/integration-tests/requirements.txt
@@ -16,3 +16,4 @@ pytest >= 7.4.0
pytest-asyncio >= 0.21.1
docker >= 6.1.3
Levenshtein
loguru
2 changes: 1 addition & 1 deletion text-generation-inference/integration-tests/test_model.py
@@ -28,7 +28,7 @@
"model_id": "google/gemma-2b-it",
"sequence_length": 1024,
"expected_greedy_output": "\n\nDeep learning is a subfield of machine learning that allows computers to learn from data",
"expected_sampling_output": "Deep learning is a subfield of machine learning that focuses on mimicking the structure and function of the human brain",
"expected_sampling_output": "\n\n**Deep learning** is a subfield of machine learning that enables computers to learn from data without explicit programming",
"expected_batch_output": "\n\nDeep learning is a subfield of machine learning that allows computers to learn from data",
"args": [
"--max-input-length", "512",