From 277222a9226a3c6f8ace941ec7dd326a3655d76a Mon Sep 17 00:00:00 2001 From: chyundunovDatamonsters Date: Fri, 17 Jan 2025 19:26:59 +0700 Subject: [PATCH 1/3] General README.md - add deploy on AMD info (#1409) Signed-off-by: Chingis Yundunov Co-authored-by: Chingis Yundunov Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- README.md | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index 369ae70609..2dd403b45b 100644 --- a/README.md +++ b/README.md @@ -43,19 +43,19 @@ Deployment are based on released docker images by default, check [docker image l #### Deploy Examples -| Use Case | Docker Compose
Deployment on Xeon | Docker Compose
Deployment on Gaudi | Kubernetes with Helm Charts | Kubernetes with GMC | -| ----------------- | ------------------------------------------------------------------------------ | ---------------------------------------------------------------------------- | ----------------------------------------------------------------- | ------------------------------------------------------------ | -| ChatQnA | [Xeon Instructions](ChatQnA/docker_compose/intel/cpu/xeon/README.md) | [Gaudi Instructions](ChatQnA/docker_compose/intel/hpu/gaudi/README.md) | [ChatQnA with Helm Charts](ChatQnA/kubernetes/helm/README.md) | [ChatQnA with GMC](ChatQnA/kubernetes/gmc/README.md) | -| CodeGen | [Xeon Instructions](CodeGen/docker_compose/intel/cpu/xeon/README.md) | [Gaudi Instructions](CodeGen/docker_compose/intel/hpu/gaudi/README.md) | [CodeGen with Helm Charts](CodeGen/kubernetes/helm/README.md) | [CodeGen with GMC](CodeGen/kubernetes/gmc/README.md) | -| CodeTrans | [Xeon Instructions](CodeTrans/docker_compose/intel/cpu/xeon/README.md) | [Gaudi Instructions](CodeTrans/docker_compose/intel/hpu/gaudi/README.md) | [CodeTrans with Helm Charts](CodeTrans/kubernetes/helm/README.md) | [CodeTrans with GMC](CodeTrans/kubernetes/gmc/README.md) | -| DocSum | [Xeon Instructions](DocSum/docker_compose/intel/cpu/xeon/README.md) | [Gaudi Instructions](DocSum/docker_compose/intel/hpu/gaudi/README.md) | [DocSum with Helm Charts](DocSum/kubernetes/helm/README.md) | [DocSum with GMC](DocSum/kubernetes/gmc/README.md) | -| SearchQnA | [Xeon Instructions](SearchQnA/docker_compose/intel/cpu/xeon/README.md) | [Gaudi Instructions](SearchQnA/docker_compose/intel/hpu/gaudi/README.md) | Not Supported | [SearchQnA with GMC](SearchQnA/kubernetes/gmc/README.md) | -| FaqGen | [Xeon Instructions](FaqGen/docker_compose/intel/cpu/xeon/README.md) | [Gaudi Instructions](FaqGen/docker_compose/intel/hpu/gaudi/README.md) | [FaqGen with Helm Charts](FaqGen/kubernetes/helm/README.md) | [FaqGen with GMC](FaqGen/kubernetes/gmc/README.md) | -| Translation | [Xeon Instructions](Translation/docker_compose/intel/cpu/xeon/README.md) | [Gaudi Instructions](Translation/docker_compose/intel/hpu/gaudi/README.md) | Not Supported | [Translation with GMC](Translation/kubernetes/gmc/README.md) | -| AudioQnA | [Xeon Instructions](AudioQnA/docker_compose/intel/cpu/xeon/README.md) | [Gaudi Instructions](AudioQnA/docker_compose/intel/hpu/gaudi/README.md) | [AudioQnA with Helm Charts](AudioQnA/kubernetes/helm/README.md) | [AudioQnA with GMC](AudioQnA/kubernetes/gmc/README.md) | -| VisualQnA | [Xeon Instructions](VisualQnA/docker_compose/intel/cpu/xeon/README.md) | [Gaudi Instructions](VisualQnA/docker_compose/intel/hpu/gaudi/README.md) | [VisualQnA with Helm Charts](VisualQnA/kubernetes/helm/README.md) | [VisualQnA with GMC](VisualQnA/kubernetes/gmc/README.md) | -| MultimodalQnA | [Xeon Instructions](MultimodalQnA/docker_compose/intel/cpu/xeon/README.md) | [Gaudi Instructions](MultimodalQnA/docker_compose/intel/hpu/gaudi/README.md) | Not supported | Not supported | -| ProductivitySuite | [Xeon Instructions](ProductivitySuite/docker_compose/intel/cpu/xeon/README.md) | Not Supported | Not Supported | Not Supported | +| Use Case | Docker Compose
Deployment on Xeon | Docker Compose
Deployment on Gaudi                                          | Docker Compose<br/>
Deployment on ROCm | Kubernetes with Helm Charts | Kubernetes with GMC | +| ----------------- | ------------------------------------------------------------------------------ | ---------------------------------------------------------------------------- | ------------------------------------------------------------------------ | ----------------------------------------------------------------- | ------------------------------------------------------------ | +| ChatQnA | [Xeon Instructions](ChatQnA/docker_compose/intel/cpu/xeon/README.md) | [Gaudi Instructions](ChatQnA/docker_compose/intel/hpu/gaudi/README.md) | [ROCm Instructions](ChatQnA/docker_compose/amd/gpu/rocm/README.md) | [ChatQnA with Helm Charts](ChatQnA/kubernetes/helm/README.md) | [ChatQnA with GMC](ChatQnA/kubernetes/gmc/README.md) | +| CodeGen | [Xeon Instructions](CodeGen/docker_compose/intel/cpu/xeon/README.md) | [Gaudi Instructions](CodeGen/docker_compose/intel/hpu/gaudi/README.md) | [ROCm Instructions](CodeGen/docker_compose/amd/gpu/rocm/README.md) | [CodeGen with Helm Charts](CodeGen/kubernetes/helm/README.md) | [CodeGen with GMC](CodeGen/kubernetes/gmc/README.md) | +| CodeTrans | [Xeon Instructions](CodeTrans/docker_compose/intel/cpu/xeon/README.md) | [Gaudi Instructions](CodeTrans/docker_compose/intel/hpu/gaudi/README.md) | [ROCm Instructions](CodeTrans/docker_compose/amd/gpu/rocm/README.md) | [CodeTrans with Helm Charts](CodeTrans/kubernetes/helm/README.md) | [CodeTrans with GMC](CodeTrans/kubernetes/gmc/README.md) | +| DocSum | [Xeon Instructions](DocSum/docker_compose/intel/cpu/xeon/README.md) | [Gaudi Instructions](DocSum/docker_compose/intel/hpu/gaudi/README.md) | [ROCm Instructions](DocSum/docker_compose/amd/gpu/rocm/README.md) | [DocSum with Helm Charts](DocSum/kubernetes/helm/README.md) | [DocSum with GMC](DocSum/kubernetes/gmc/README.md) | +| SearchQnA | [Xeon Instructions](SearchQnA/docker_compose/intel/cpu/xeon/README.md) | [Gaudi Instructions](SearchQnA/docker_compose/intel/hpu/gaudi/README.md) | Not Supported | Not Supported | [SearchQnA with GMC](SearchQnA/kubernetes/gmc/README.md) | +| FaqGen | [Xeon Instructions](FaqGen/docker_compose/intel/cpu/xeon/README.md) | [Gaudi Instructions](FaqGen/docker_compose/intel/hpu/gaudi/README.md) | [ROCm Instructions](FaqGen/docker_compose/amd/gpu/rocm/README.md) | [FaqGen with Helm Charts](FaqGen/kubernetes/helm/README.md) | [FaqGen with GMC](FaqGen/kubernetes/gmc/README.md) | +| Translation | [Xeon Instructions](Translation/docker_compose/intel/cpu/xeon/README.md) | [Gaudi Instructions](Translation/docker_compose/intel/hpu/gaudi/README.md) | [ROCm Instructions](Translation/docker_compose/amd/gpu/rocm/README.md) | Not Supported | [Translation with GMC](Translation/kubernetes/gmc/README.md) | +| AudioQnA | [Xeon Instructions](AudioQnA/docker_compose/intel/cpu/xeon/README.md) | [Gaudi Instructions](AudioQnA/docker_compose/intel/hpu/gaudi/README.md) | [ROCm Instructions](AudioQnA/docker_compose/amd/gpu/rocm/README.md) | [AudioQnA with Helm Charts](AudioQnA/kubernetes/helm/README.md) | [AudioQnA with GMC](AudioQnA/kubernetes/gmc/README.md) | +| VisualQnA | [Xeon Instructions](VisualQnA/docker_compose/intel/cpu/xeon/README.md) | [Gaudi Instructions](VisualQnA/docker_compose/intel/hpu/gaudi/README.md) | [ROCm Instructions](VisualQnA/docker_compose/amd/gpu/rocm/README.md) | [VisualQnA with Helm Charts](VisualQnA/kubernetes/helm/README.md) | [VisualQnA with GMC](VisualQnA/kubernetes/gmc/README.md) | +| MultimodalQnA | [Xeon 
Instructions](MultimodalQnA/docker_compose/intel/cpu/xeon/README.md) | [Gaudi Instructions](MultimodalQnA/docker_compose/intel/hpu/gaudi/README.md) | [ROCm Instructions](MultimodalQnA/docker_compose/amd/gpu/rocm/README.md) | Not supported | Not supported | +| ProductivitySuite | [Xeon Instructions](ProductivitySuite/docker_compose/intel/cpu/xeon/README.md) | Not Supported | Not Supported | Not Supported | Not Supported | ## Supported Examples From 00e9da9ced6a703f44c1932468808e4c26b832f7 Mon Sep 17 00:00:00 2001 From: "Wang, Kai Lawrence" <109344418+wangkl2@users.noreply.github.com> Date: Fri, 17 Jan 2025 20:46:38 +0800 Subject: [PATCH 2/3] [ChatQnA] Switch to vLLM as default llm backend on Gaudi (#1404) Switching from TGI to vLLM as the default LLM serving backend on Gaudi for the ChatQnA example to enhance the perf. https://github.com/opea-project/GenAIExamples/issues/1213 Signed-off-by: Wang, Kai Lawrence --- ChatQnA/README.md | 13 +- .../docker_compose/intel/hpu/gaudi/README.md | 49 ++-- .../intel/hpu/gaudi/compose.yaml | 48 +--- .../intel/hpu/gaudi/compose_guardrails.yaml | 22 +- .../{compose_vllm.yaml => compose_tgi.yaml} | 46 +++- .../hpu/gaudi/compose_without_rerank.yaml | 24 +- .../tests/test_compose_guardrails_on_gaudi.sh | 43 ++-- ChatQnA/tests/test_compose_on_gaudi.sh | 125 +++------- ChatQnA/tests/test_compose_tgi_on_gaudi.sh | 235 ++++++++++++++++++ ChatQnA/tests/test_compose_vllm_on_gaudi.sh | 183 -------------- .../test_compose_without_rerank_on_gaudi.sh | 24 +- 11 files changed, 411 insertions(+), 401 deletions(-) rename ChatQnA/docker_compose/intel/hpu/gaudi/{compose_vllm.yaml => compose_tgi.yaml} (77%) create mode 100644 ChatQnA/tests/test_compose_tgi_on_gaudi.sh delete mode 100644 ChatQnA/tests/test_compose_vllm_on_gaudi.sh diff --git a/ChatQnA/README.md b/ChatQnA/README.md index 5db1a6aa35..728267197e 100644 --- a/ChatQnA/README.md +++ b/ChatQnA/README.md @@ -202,7 +202,7 @@ Gaudi default compose.yaml | Embedding | Langchain | Xeon | 6000 | /v1/embeddings | | Retriever | Langchain, Redis | Xeon | 7000 | /v1/retrieval | | Reranking | Langchain, TEI | Gaudi | 8000 | /v1/reranking | -| LLM | Langchain, TGI | Gaudi | 9000 | /v1/chat/completions | +| LLM | Langchain, vLLM | Gaudi | 9000 | /v1/chat/completions | | Dataprep | Redis, Langchain | Xeon | 6007 | /v1/dataprep | ### Required Models @@ -266,16 +266,21 @@ Refer to the [Intel Technology enabling for Openshift readme](https://github.com ### Check Service Status -Before consuming ChatQnA Service, make sure the TGI/vLLM service is ready (which takes up to 2 minutes to start). +Before consuming ChatQnA Service, make sure the vLLM/TGI service is ready, which takes some time. ```bash +# vLLM example +docker logs vllm-gaudi-server 2>&1 | grep complete # TGI example -docker logs tgi-service | grep Connected +docker logs tgi-gaudi-server | grep Connected ``` -Consume ChatQnA service until you get the TGI response like below. +Consume ChatQnA service until you get the response like below. ```log +# vLLM +INFO: Application startup complete. 
+# TGI 2024-09-03T02:47:53.402023Z INFO text_generation_router::server: router/src/server.rs:2311: Connected ``` diff --git a/ChatQnA/docker_compose/intel/hpu/gaudi/README.md b/ChatQnA/docker_compose/intel/hpu/gaudi/README.md index 85b0338549..aa0e150fb1 100644 --- a/ChatQnA/docker_compose/intel/hpu/gaudi/README.md +++ b/ChatQnA/docker_compose/intel/hpu/gaudi/README.md @@ -1,6 +1,8 @@ # Build MegaService of ChatQnA on Gaudi -This document outlines the deployment process for a ChatQnA application utilizing the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline on Intel Gaudi server. The steps include Docker image creation, container deployment via Docker Compose, and service execution to integrate microservices such as embedding, retriever, rerank, and llm. We will publish the Docker images to Docker Hub, it will simplify the deployment process for this service. +This document outlines the deployment process for a ChatQnA application utilizing the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline on Intel Gaudi server. The steps include Docker image creation, container deployment via Docker Compose, and service execution to integrate microservices such as `embedding`, `retriever`, `rerank`, and `llm`. + +The default pipeline deploys with vLLM as the LLM serving component and leverages rerank component. It also provides options of not using rerank in the pipeline, leveraging guardrails, or using TGI backend for LLM microservice, please refer to [start-all-the-services-docker-containers](#start-all-the-services-docker-containers) section in this page. Quick Start: @@ -184,7 +186,7 @@ By default, the embedding, reranking and LLM models are set to a default value a Change the `xxx_MODEL_ID` below for your needs. -For users in China who are unable to download models directly from Huggingface, you can use [ModelScope](https://www.modelscope.cn/models) or a Huggingface mirror to download models. TGI can load the models either online or offline as described below: +For users in China who are unable to download models directly from Huggingface, you can use [ModelScope](https://www.modelscope.cn/models) or a Huggingface mirror to download models. The vLLM/TGI can load the models either online or offline as described below: 1. 
Online @@ -192,7 +194,10 @@ For users in China who are unable to download models directly from Huggingface, export HF_TOKEN=${your_hf_token} export HF_ENDPOINT="https://hf-mirror.com" model_name="Intel/neural-chat-7b-v3-3" - docker run -p 8008:80 -v ./data:/data --name tgi-service -e HF_ENDPOINT=$HF_ENDPOINT -e http_proxy=$http_proxy -e https_proxy=$https_proxy --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none -e HUGGING_FACE_HUB_TOKEN=$HF_TOKEN -e ENABLE_HPU_GRAPH=true -e LIMIT_HPU_GRAPH=true -e USE_FLASH_ATTENTION=true -e FLASH_ATTENTION_RECOMPUTE=true --cap-add=sys_nice --ipc=host ghcr.io/huggingface/tgi-gaudi:2.0.6 --model-id $model_name --max-input-tokens 1024 --max-total-tokens 2048 + # Start vLLM LLM Service + docker run -p 8007:80 -v ./data:/data --name vllm-gaudi-server -e HF_ENDPOINT=$HF_ENDPOINT -e http_proxy=$http_proxy -e https_proxy=$https_proxy --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none -e HUGGING_FACE_HUB_TOKEN=$HF_TOKEN -e VLLM_TORCH_PROFILER_DIR="/mnt" --cap-add=sys_nice --ipc=host opea/vllm-gaudi:latest --model $model_name --tensor-parallel-size 1 --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048 + # Start TGI LLM Service + docker run -p 8005:80 -v ./data:/data --name tgi-gaudi-server -e HF_ENDPOINT=$HF_ENDPOINT -e http_proxy=$http_proxy -e https_proxy=$https_proxy --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none -e HUGGING_FACE_HUB_TOKEN=$HF_TOKEN -e ENABLE_HPU_GRAPH=true -e LIMIT_HPU_GRAPH=true -e USE_FLASH_ATTENTION=true -e FLASH_ATTENTION_RECOMPUTE=true --cap-add=sys_nice --ipc=host ghcr.io/huggingface/tgi-gaudi:2.0.6 --model-id $model_name --max-input-tokens 1024 --max-total-tokens 2048 ``` 2. Offline @@ -201,12 +206,15 @@ For users in China who are unable to download models directly from Huggingface, - Click on `Download this model` button, and choose one way to download the model to your local path `/path/to/model`. - - Run the following command to start TGI service. + - Run the following command to start the LLM service. 
```bash export HF_TOKEN=${your_hf_token} export model_path="/path/to/model" - docker run -p 8008:80 -v $model_path:/data --name tgi_service --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none -e HUGGING_FACE_HUB_TOKEN=$HF_TOKEN -e ENABLE_HPU_GRAPH=true -e LIMIT_HPU_GRAPH=true -e USE_FLASH_ATTENTION=true -e FLASH_ATTENTION_RECOMPUTE=true --cap-add=sys_nice --ipc=host ghcr.io/huggingface/tgi-gaudi:2.0.6 --model-id /data --max-input-tokens 1024 --max-total-tokens 2048 + # Start vLLM LLM Service + docker run -p 8007:80 -v $model_path:/data --name vllm-gaudi-server --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none -e HUGGING_FACE_HUB_TOKEN=$HF_TOKEN -e VLLM_TORCH_PROFILER_DIR="/mnt" --cap-add=sys_nice --ipc=host opea/vllm-gaudi:latest --model /data --tensor-parallel-size 1 --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048 + # Start TGI LLM Service + docker run -p 8005:80 -v $model_path:/data --name tgi-gaudi-server --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none -e HUGGING_FACE_HUB_TOKEN=$HF_TOKEN -e ENABLE_HPU_GRAPH=true -e LIMIT_HPU_GRAPH=true -e USE_FLASH_ATTENTION=true -e FLASH_ATTENTION_RECOMPUTE=true --cap-add=sys_nice --ipc=host ghcr.io/huggingface/tgi-gaudi:2.0.6 --model-id /data --max-input-tokens 1024 --max-total-tokens 2048 ``` ### Setup Environment Variables @@ -242,7 +250,7 @@ For users in China who are unable to download models directly from Huggingface, cd GenAIExamples/ChatQnA/docker_compose/intel/hpu/gaudi/ ``` -If use tgi for llm backend. +If use vLLM as the LLM serving backend. ```bash # Start ChatQnA with Rerank Pipeline @@ -251,10 +259,10 @@ docker compose -f compose.yaml up -d docker compose -f compose_without_rerank.yaml up -d ``` -If use vllm for llm backend. +If use TGI as the LLM serving backend. ```bash -docker compose -f compose_vllm.yaml up -d +docker compose -f compose_tgi.yaml up -d ``` If you want to enable guardrails microservice in the pipeline, please follow the below command instead: @@ -309,35 +317,40 @@ For validation details, please refer to [how-to-validate_service](./how_to_valid 4. LLM backend Service - In first startup, this service will take more time to download the model files. After it's finished, the service will be ready. + In the first startup, this service will take more time to download, load and warm up the model. After it's finished, the service will be ready. Try the command below to check whether the LLM serving is ready. ```bash - docker logs tgi-gaudi-server | grep Connected + # vLLM service + docker logs vllm-gaudi-server 2>&1 | grep complete + # If the service is ready, you will get the response like below. + INFO: Application startup complete. ``` + ```bash + # TGI service + docker logs tgi-gaudi-server | grep Connected If the service is ready, you will get the response like below. - - ``` 2024-09-03T02:47:53.402023Z INFO text_generation_router::server: router/src/server.rs:2311: Connected ``` Then try the `cURL` command below to validate services. 
```bash - # TGI service - curl http://${host_ip}:8005/v1/chat/completions \ + # vLLM Service + curl http://${host_ip}:8007/v1/chat/completions \ -X POST \ -d '{"model": ${LLM_MODEL_ID}, "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens":17}' \ -H 'Content-Type: application/json' ``` ```bash - # vLLM Service - curl http://${host_ip}:8007/v1/chat/completions \ - -H "Content-Type: application/json" \ - -d '{"model": ${LLM_MODEL_ID}, "messages": [{"role": "user", "content": "What is Deep Learning?"}]}' + # TGI service + curl http://${host_ip}:8005/v1/chat/completions \ + -X POST \ + -d '{"model": ${LLM_MODEL_ID}, "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens":17}' \ + -H 'Content-Type: application/json' ``` 5. MegaService diff --git a/ChatQnA/docker_compose/intel/hpu/gaudi/compose.yaml b/ChatQnA/docker_compose/intel/hpu/gaudi/compose.yaml index cc75704aef..ddd1afadeb 100644 --- a/ChatQnA/docker_compose/intel/hpu/gaudi/compose.yaml +++ b/ChatQnA/docker_compose/intel/hpu/gaudi/compose.yaml @@ -25,7 +25,6 @@ services: INDEX_NAME: ${INDEX_NAME} TEI_ENDPOINT: http://tei-embedding-service:80 HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} - TELEMETRY_ENDPOINT: ${TELEMETRY_ENDPOINT} tei-embedding-service: image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 container_name: tei-embedding-gaudi-server @@ -38,7 +37,7 @@ services: no_proxy: ${no_proxy} http_proxy: ${http_proxy} https_proxy: ${https_proxy} - command: --model-id ${EMBEDDING_MODEL_ID} --auto-truncate --otlp-endpoint $OTEL_EXPORTER_OTLP_TRACES_ENDPOINT + command: --model-id ${EMBEDDING_MODEL_ID} --auto-truncate retriever: image: ${REGISTRY:-opea}/retriever:${TAG:-latest} container_name: retriever-redis-server @@ -56,9 +55,6 @@ services: INDEX_NAME: ${INDEX_NAME} TEI_EMBEDDING_ENDPOINT: http://tei-embedding-service:80 HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} - TELEMETRY_ENDPOINT: ${TELEMETRY_ENDPOINT} - LOGFLAG: ${LOGFLAG} - RETRIEVER_COMPONENT_NAME: "OPEA_RETRIEVER_REDIS" restart: unless-stopped tei-reranking-service: image: ghcr.io/huggingface/tei-gaudi:1.5.0 @@ -80,47 +76,28 @@ services: HABANA_VISIBLE_DEVICES: all OMPI_MCA_btl_vader_single_copy_mechanism: none MAX_WARMUP_SEQUENCE_LENGTH: 512 - command: --model-id ${RERANK_MODEL_ID} --auto-truncate --otlp-endpoint $OTEL_EXPORTER_OTLP_TRACES_ENDPOINT - tgi-service: - image: ghcr.io/huggingface/tgi-gaudi:2.0.6 - container_name: tgi-gaudi-server + command: --model-id ${RERANK_MODEL_ID} --auto-truncate + vllm-service: + image: ${REGISTRY:-opea}/vllm-gaudi:${TAG:-latest} + container_name: vllm-gaudi-server ports: - - "8005:80" + - "8007:80" volumes: - "./data:/data" environment: no_proxy: ${no_proxy} http_proxy: ${http_proxy} https_proxy: ${https_proxy} - HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} - HF_HUB_DISABLE_PROGRESS_BARS: 1 - HF_HUB_ENABLE_HF_TRANSFER: 0 + HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} HABANA_VISIBLE_DEVICES: all OMPI_MCA_btl_vader_single_copy_mechanism: none - ENABLE_HPU_GRAPH: true - LIMIT_HPU_GRAPH: true - USE_FLASH_ATTENTION: true - FLASH_ATTENTION_RECOMPUTE: true + LLM_MODEL_ID: ${LLM_MODEL_ID} + VLLM_TORCH_PROFILER_DIR: "/mnt" runtime: habana cap_add: - SYS_NICE ipc: host - command: --model-id ${LLM_MODEL_ID} --max-input-length 2048 --max-total-tokens 4096 --otlp-endpoint $OTEL_EXPORTER_OTLP_TRACES_ENDPOINT - jaeger: - image: jaegertracing/all-in-one:latest - container_name: jaeger - ports: - - "16686:16686" - - "4317:4317" - - "4318:4318" - - "9411:9411" - ipc: host - 
environment: - no_proxy: ${no_proxy} - http_proxy: ${http_proxy} - https_proxy: ${https_proxy} - COLLECTOR_ZIPKIN_HOST_PORT: 9411 - restart: unless-stopped + command: --model $LLM_MODEL_ID --tensor-parallel-size 1 --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048 chatqna-gaudi-backend-server: image: ${REGISTRY:-opea}/chatqna:${TAG:-latest} container_name: chatqna-gaudi-backend-server @@ -129,7 +106,7 @@ services: - tei-embedding-service - retriever - tei-reranking-service - - tgi-service + - vllm-service ports: - "8888:8888" environment: @@ -142,11 +119,10 @@ services: - RETRIEVER_SERVICE_HOST_IP=retriever - RERANK_SERVER_HOST_IP=tei-reranking-service - RERANK_SERVER_PORT=${RERANK_SERVER_PORT:-80} - - LLM_SERVER_HOST_IP=tgi-service + - LLM_SERVER_HOST_IP=vllm-service - LLM_SERVER_PORT=${LLM_SERVER_PORT:-80} - LLM_MODEL=${LLM_MODEL_ID} - LOGFLAG=${LOGFLAG} - - TELEMETRY_ENDPOINT=${TELEMETRY_ENDPOINT} ipc: host restart: always chatqna-gaudi-ui-server: diff --git a/ChatQnA/docker_compose/intel/hpu/gaudi/compose_guardrails.yaml b/ChatQnA/docker_compose/intel/hpu/gaudi/compose_guardrails.yaml index 4f062dce3f..936be4045c 100644 --- a/ChatQnA/docker_compose/intel/hpu/gaudi/compose_guardrails.yaml +++ b/ChatQnA/docker_compose/intel/hpu/gaudi/compose_guardrails.yaml @@ -118,9 +118,9 @@ services: OMPI_MCA_btl_vader_single_copy_mechanism: none MAX_WARMUP_SEQUENCE_LENGTH: 512 command: --model-id ${RERANK_MODEL_ID} --auto-truncate - tgi-service: - image: ghcr.io/huggingface/tgi-gaudi:2.0.6 - container_name: tgi-gaudi-server + vllm-service: + image: ${REGISTRY:-opea}/vllm-gaudi:${TAG:-latest} + container_name: vllm-gaudi-server ports: - "8008:80" volumes: @@ -129,20 +129,16 @@ services: no_proxy: ${no_proxy} http_proxy: ${http_proxy} https_proxy: ${https_proxy} - HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} - HF_HUB_DISABLE_PROGRESS_BARS: 1 - HF_HUB_ENABLE_HF_TRANSFER: 0 + HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} HABANA_VISIBLE_DEVICES: all OMPI_MCA_btl_vader_single_copy_mechanism: none - ENABLE_HPU_GRAPH: true - LIMIT_HPU_GRAPH: true - USE_FLASH_ATTENTION: true - FLASH_ATTENTION_RECOMPUTE: true + LLM_MODEL_ID: ${LLM_MODEL_ID} + VLLM_TORCH_PROFILER_DIR: "/mnt" runtime: habana cap_add: - SYS_NICE ipc: host - command: --model-id ${LLM_MODEL_ID} --max-input-length 1024 --max-total-tokens 2048 + command: --model $LLM_MODEL_ID --tensor-parallel-size 1 --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048 chatqna-gaudi-backend-server: image: ${REGISTRY:-opea}/chatqna-guardrails:${TAG:-latest} container_name: chatqna-gaudi-guardrails-server @@ -153,7 +149,7 @@ services: - tei-embedding-service - retriever - tei-reranking-service - - tgi-service + - vllm-service ports: - "8888:8888" environment: @@ -168,7 +164,7 @@ services: - RETRIEVER_SERVICE_HOST_IP=retriever - RERANK_SERVER_HOST_IP=tei-reranking-service - RERANK_SERVER_PORT=${RERANK_SERVER_PORT:-80} - - LLM_SERVER_HOST_IP=tgi-service + - LLM_SERVER_HOST_IP=vllm-service - LLM_SERVER_PORT=${LLM_SERVER_PORT:-80} - LLM_MODEL=${LLM_MODEL_ID} - LOGFLAG=${LOGFLAG} diff --git a/ChatQnA/docker_compose/intel/hpu/gaudi/compose_vllm.yaml b/ChatQnA/docker_compose/intel/hpu/gaudi/compose_tgi.yaml similarity index 77% rename from ChatQnA/docker_compose/intel/hpu/gaudi/compose_vllm.yaml rename to ChatQnA/docker_compose/intel/hpu/gaudi/compose_tgi.yaml index 5c7bd8e0d2..cc75704aef 100644 --- a/ChatQnA/docker_compose/intel/hpu/gaudi/compose_vllm.yaml +++ 
b/ChatQnA/docker_compose/intel/hpu/gaudi/compose_tgi.yaml @@ -25,6 +25,7 @@ services: INDEX_NAME: ${INDEX_NAME} TEI_ENDPOINT: http://tei-embedding-service:80 HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} + TELEMETRY_ENDPOINT: ${TELEMETRY_ENDPOINT} tei-embedding-service: image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 container_name: tei-embedding-gaudi-server @@ -37,7 +38,7 @@ services: no_proxy: ${no_proxy} http_proxy: ${http_proxy} https_proxy: ${https_proxy} - command: --model-id ${EMBEDDING_MODEL_ID} --auto-truncate + command: --model-id ${EMBEDDING_MODEL_ID} --auto-truncate --otlp-endpoint $OTEL_EXPORTER_OTLP_TRACES_ENDPOINT retriever: image: ${REGISTRY:-opea}/retriever:${TAG:-latest} container_name: retriever-redis-server @@ -55,6 +56,7 @@ services: INDEX_NAME: ${INDEX_NAME} TEI_EMBEDDING_ENDPOINT: http://tei-embedding-service:80 HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} + TELEMETRY_ENDPOINT: ${TELEMETRY_ENDPOINT} LOGFLAG: ${LOGFLAG} RETRIEVER_COMPONENT_NAME: "OPEA_RETRIEVER_REDIS" restart: unless-stopped @@ -78,28 +80,47 @@ services: HABANA_VISIBLE_DEVICES: all OMPI_MCA_btl_vader_single_copy_mechanism: none MAX_WARMUP_SEQUENCE_LENGTH: 512 - command: --model-id ${RERANK_MODEL_ID} --auto-truncate - vllm-service: - image: ${REGISTRY:-opea}/vllm-gaudi:${TAG:-latest} - container_name: vllm-gaudi-server + command: --model-id ${RERANK_MODEL_ID} --auto-truncate --otlp-endpoint $OTEL_EXPORTER_OTLP_TRACES_ENDPOINT + tgi-service: + image: ghcr.io/huggingface/tgi-gaudi:2.0.6 + container_name: tgi-gaudi-server ports: - - "8007:80" + - "8005:80" volumes: - "./data:/data" environment: no_proxy: ${no_proxy} http_proxy: ${http_proxy} https_proxy: ${https_proxy} - HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} + HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} + HF_HUB_DISABLE_PROGRESS_BARS: 1 + HF_HUB_ENABLE_HF_TRANSFER: 0 HABANA_VISIBLE_DEVICES: all OMPI_MCA_btl_vader_single_copy_mechanism: none - LLM_MODEL_ID: ${LLM_MODEL_ID} - VLLM_TORCH_PROFILER_DIR: "/mnt" + ENABLE_HPU_GRAPH: true + LIMIT_HPU_GRAPH: true + USE_FLASH_ATTENTION: true + FLASH_ATTENTION_RECOMPUTE: true runtime: habana cap_add: - SYS_NICE ipc: host - command: --model $LLM_MODEL_ID --tensor-parallel-size 1 --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048 + command: --model-id ${LLM_MODEL_ID} --max-input-length 2048 --max-total-tokens 4096 --otlp-endpoint $OTEL_EXPORTER_OTLP_TRACES_ENDPOINT + jaeger: + image: jaegertracing/all-in-one:latest + container_name: jaeger + ports: + - "16686:16686" + - "4317:4317" + - "4318:4318" + - "9411:9411" + ipc: host + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + COLLECTOR_ZIPKIN_HOST_PORT: 9411 + restart: unless-stopped chatqna-gaudi-backend-server: image: ${REGISTRY:-opea}/chatqna:${TAG:-latest} container_name: chatqna-gaudi-backend-server @@ -108,7 +129,7 @@ services: - tei-embedding-service - retriever - tei-reranking-service - - vllm-service + - tgi-service ports: - "8888:8888" environment: @@ -121,10 +142,11 @@ services: - RETRIEVER_SERVICE_HOST_IP=retriever - RERANK_SERVER_HOST_IP=tei-reranking-service - RERANK_SERVER_PORT=${RERANK_SERVER_PORT:-80} - - LLM_SERVER_HOST_IP=vllm-service + - LLM_SERVER_HOST_IP=tgi-service - LLM_SERVER_PORT=${LLM_SERVER_PORT:-80} - LLM_MODEL=${LLM_MODEL_ID} - LOGFLAG=${LOGFLAG} + - TELEMETRY_ENDPOINT=${TELEMETRY_ENDPOINT} ipc: host restart: always chatqna-gaudi-ui-server: diff --git 
a/ChatQnA/docker_compose/intel/hpu/gaudi/compose_without_rerank.yaml b/ChatQnA/docker_compose/intel/hpu/gaudi/compose_without_rerank.yaml index 8da9ecc0e4..8b800525e9 100644 --- a/ChatQnA/docker_compose/intel/hpu/gaudi/compose_without_rerank.yaml +++ b/ChatQnA/docker_compose/intel/hpu/gaudi/compose_without_rerank.yaml @@ -58,31 +58,27 @@ services: LOGFLAG: ${LOGFLAG} RETRIEVER_COMPONENT_NAME: "OPEA_RETRIEVER_REDIS" restart: unless-stopped - tgi-service: - image: ghcr.io/huggingface/tgi-gaudi:2.0.6 - container_name: tgi-gaudi-server + vllm-service: + image: ${REGISTRY:-opea}/vllm-gaudi:${TAG:-latest} + container_name: vllm-gaudi-server ports: - - "8005:80" + - "8007:80" volumes: - "./data:/data" environment: no_proxy: ${no_proxy} http_proxy: ${http_proxy} https_proxy: ${https_proxy} - HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} - HF_HUB_DISABLE_PROGRESS_BARS: 1 - HF_HUB_ENABLE_HF_TRANSFER: 0 + HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} HABANA_VISIBLE_DEVICES: all OMPI_MCA_btl_vader_single_copy_mechanism: none - ENABLE_HPU_GRAPH: true - LIMIT_HPU_GRAPH: true - USE_FLASH_ATTENTION: true - FLASH_ATTENTION_RECOMPUTE: true + LLM_MODEL_ID: ${LLM_MODEL_ID} + VLLM_TORCH_PROFILER_DIR: "/mnt" runtime: habana cap_add: - SYS_NICE ipc: host - command: --model-id ${LLM_MODEL_ID} --max-input-length 1024 --max-total-tokens 2048 + command: --model $LLM_MODEL_ID --tensor-parallel-size 1 --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048 chatqna-gaudi-backend-server: image: ${REGISTRY:-opea}/chatqna-without-rerank:${TAG:-latest} container_name: chatqna-gaudi-backend-server @@ -90,7 +86,7 @@ services: - redis-vector-db - tei-embedding-service - retriever - - tgi-service + - vllm-service ports: - "8888:8888" environment: @@ -101,7 +97,7 @@ services: - EMBEDDING_SERVER_HOST_IP=tei-embedding-service - EMBEDDING_SERVER_PORT=${EMBEDDING_SERVER_PORT:-80} - RETRIEVER_SERVICE_HOST_IP=retriever - - LLM_SERVER_HOST_IP=tgi-service + - LLM_SERVER_HOST_IP=vllm-service - LLM_SERVER_PORT=${LLM_SERVER_PORT:-80} - LLM_MODEL=${LLM_MODEL_ID} - LOGFLAG=${LOGFLAG} diff --git a/ChatQnA/tests/test_compose_guardrails_on_gaudi.sh b/ChatQnA/tests/test_compose_guardrails_on_gaudi.sh index 8fe8dc733f..f36882a82c 100644 --- a/ChatQnA/tests/test_compose_guardrails_on_gaudi.sh +++ b/ChatQnA/tests/test_compose_guardrails_on_gaudi.sh @@ -17,9 +17,10 @@ ip_address=$(hostname -I | awk '{print $1}') function build_docker_images() { cd $WORKPATH/docker_image_build git clone https://github.com/opea-project/GenAIComps.git && cd GenAIComps && git checkout "${opea_branch:-"main"}" && cd ../ + git clone https://github.com/HabanaAI/vllm-fork.git && cd vllm-fork && git checkout v0.6.4.post2+Gaudi-1.19.0 && cd ../ echo "Build all the images with --no-cache, check docker_image_build.log for details..." 
- service_list="chatqna-guardrails chatqna-ui dataprep-redis retriever guardrails nginx" + service_list="chatqna-guardrails chatqna-ui dataprep-redis retriever vllm-gaudi guardrails nginx" docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log docker pull ghcr.io/huggingface/tgi-gaudi:2.0.6 @@ -41,9 +42,10 @@ function start_services() { # Start Docker Containers docker compose -f compose_guardrails.yaml up -d > ${LOG_PATH}/start_services_with_compose.log n=0 - until [[ "$n" -ge 100 ]]; do - docker logs tgi-gaudi-server > tgi_service_start.log - if grep -q Connected tgi_service_start.log; then + until [[ "$n" -ge 160 ]]; do + echo "n=$n" + docker logs vllm-gaudi-server > vllm_service_start.log + if grep -q "Warmup finished" vllm_service_start.log; then break fi sleep 5s @@ -51,18 +53,19 @@ function start_services() { done # Make sure tgi guardrails service is ready - n=0 - until [[ "$n" -ge 100 ]]; do + m=0 + until [[ "$m" -ge 160 ]]; do + echo "m=$m" docker logs tgi-guardrails-server > tgi_guardrails_service_start.log if grep -q Connected tgi_guardrails_service_start.log; then break fi sleep 5s - n=$((n+1)) + m=$((m+1)) done } -function validate_services() { +function validate_service() { local URL="$1" local EXPECTED_RESULT="$2" local SERVICE_NAME="$3" @@ -97,7 +100,7 @@ function validate_microservices() { # Check if the microservices are running correctly. # tei for embedding service - validate_services \ + validate_service \ "${ip_address}:8090/embed" \ "[[" \ "tei-embedding" \ @@ -108,7 +111,7 @@ function validate_microservices() { # retrieval microservice test_embedding=$(python3 -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)") - validate_services \ + validate_service \ "${ip_address}:7000/v1/retrieval" \ "retrieved_docs" \ "retrieval" \ @@ -116,23 +119,23 @@ function validate_microservices() { "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${test_embedding}}" # tei for rerank microservice - validate_services \ + validate_service \ "${ip_address}:8808/rerank" \ '{"index":1,"score":' \ "tei-rerank" \ "tei-reranking-gaudi-server" \ '{"query":"What is Deep Learning?", "texts": ["Deep Learning is not...", "Deep learning is..."]}' - # tgi for llm service - validate_services \ - "${ip_address}:8008/generate" \ - "generated_text" \ - "tgi-llm" \ - "tgi-gaudi-server" \ - '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' + # vllm for llm service + validate_service \ + "${ip_address}:8008/v1/chat/completions" \ + "content" \ + "vllm-llm" \ + "vllm-gaudi-server" \ + '{"model": "Intel/neural-chat-7b-v3-3", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens":17}' # guardrails microservice - validate_services \ + validate_service \ "${ip_address}:9090/v1/guardrails" \ "Violated policies" \ "guardrails" \ @@ -142,7 +145,7 @@ function validate_microservices() { function validate_megaservice() { # Curl the Mega Service - validate_services \ + validate_service \ "${ip_address}:8888/v1/chatqna" \ "data: " \ "mega-chatqna" \ diff --git a/ChatQnA/tests/test_compose_on_gaudi.sh b/ChatQnA/tests/test_compose_on_gaudi.sh index 22eccb2d5d..f9d0e48d0d 100644 --- a/ChatQnA/tests/test_compose_on_gaudi.sh +++ b/ChatQnA/tests/test_compose_on_gaudi.sh @@ -17,15 +17,14 @@ ip_address=$(hostname -I | awk '{print $1}') function build_docker_images() { cd $WORKPATH/docker_image_build git clone https://github.com/opea-project/GenAIComps.git && 
cd GenAIComps && git checkout "${opea_branch:-"main"}" && cd ../ + git clone https://github.com/HabanaAI/vllm-fork.git && cd vllm-fork && git checkout v0.6.4.post2+Gaudi-1.19.0 && cd ../ echo "Build all the images with --no-cache, check docker_image_build.log for details..." - service_list="chatqna chatqna-ui dataprep-redis retriever nginx" + service_list="chatqna chatqna-ui dataprep-redis retriever vllm-gaudi nginx" docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log - docker pull ghcr.io/huggingface/tgi-gaudi:2.0.6 docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 docker pull ghcr.io/huggingface/tei-gaudi:1.5.0 - docker images && sleep 1s } @@ -33,24 +32,20 @@ function start_services() { cd $WORKPATH/docker_compose/intel/hpu/gaudi export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5" export RERANK_MODEL_ID="BAAI/bge-reranker-base" - export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct" + export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3" export INDEX_NAME="rag-redis" export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN} - export JAEGER_IP=$(ip route get 8.8.8.8 | grep -oP 'src \K[^ ]+') - export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=grpc://$JAEGER_IP:4317 - export TELEMETRY_ENDPOINT=http://$JAEGER_IP:4318/v1/traces # Start Docker Containers - sed -i "s|container_name: chatqna-gaudi-backend-server|container_name: chatqna-gaudi-backend-server\n volumes:\n - \"${WORKPATH}\/docker_image_build\/GenAIComps:\/home\/user\/GenAIComps\"|g" compose.yaml docker compose -f compose.yaml up -d > ${LOG_PATH}/start_services_with_compose.log - n=0 - until [[ "$n" -ge 500 ]]; do - docker logs tgi-gaudi-server > ${LOG_PATH}/tgi_service_start.log - if grep -q Connected ${LOG_PATH}/tgi_service_start.log; then + until [[ "$n" -ge 160 ]]; do + echo "n=$n" + docker logs vllm-gaudi-server > vllm_service_start.log + if grep -q "Warmup finished" vllm_service_start.log; then break fi - sleep 1s + sleep 5s n=$((n+1)) done } @@ -62,38 +57,24 @@ function validate_service() { local DOCKER_NAME="$4" local INPUT_DATA="$5" - if [[ $SERVICE_NAME == *"dataprep_upload_file"* ]]; then - cd $LOG_PATH - HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -F 'files=@./dataprep_file.txt' -H 'Content-Type: multipart/form-data' "$URL") - elif [[ $SERVICE_NAME == *"dataprep_upload_link"* ]]; then - HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -F 'link_list=["https://www.ces.tech/"]' "$URL") - elif [[ $SERVICE_NAME == *"dataprep_get"* ]]; then - HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -H 'Content-Type: application/json' "$URL") - elif [[ $SERVICE_NAME == *"dataprep_del"* ]]; then - HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -d '{"file_path": "all"}' -H 'Content-Type: application/json' "$URL") - else - HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL") - fi - HTTP_STATUS=$(echo $HTTP_RESPONSE | tr -d '\n' | sed -e 's/.*HTTPSTATUS://') - RESPONSE_BODY=$(echo $HTTP_RESPONSE | sed -e 's/HTTPSTATUS\:.*//g') + local HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL") + if [ "$HTTP_STATUS" -eq 200 ]; then + echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..." 
- docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log + local CONTENT=$(curl -s -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL" | tee ${LOG_PATH}/${SERVICE_NAME}.log) - # check response status - if [ "$HTTP_STATUS" -ne "200" ]; then - echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS" - exit 1 + if echo "$CONTENT" | grep -q "$EXPECTED_RESULT"; then + echo "[ $SERVICE_NAME ] Content is as expected." + else + echo "[ $SERVICE_NAME ] Content does not match the expected result: $CONTENT" + docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log + exit 1 + fi else - echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..." - fi - # check response body - if [[ "$RESPONSE_BODY" != *"$EXPECTED_RESULT"* ]]; then - echo "[ $SERVICE_NAME ] Content does not match the expected result: $RESPONSE_BODY" + echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS" + docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log exit 1 - else - echo "[ $SERVICE_NAME ] Content is as expected." fi - sleep 1s } @@ -103,48 +84,19 @@ function validate_microservices() { # tei for embedding service validate_service \ "${ip_address}:8090/embed" \ - "[[" \ + "\[\[" \ "tei-embedding" \ "tei-embedding-gaudi-server" \ '{"inputs":"What is Deep Learning?"}' sleep 1m # retrieval can't curl as expected, try to wait for more time - # test /v1/dataprep upload file - echo "Deep learning is a subset of machine learning that utilizes neural networks with multiple layers to analyze various levels of abstract data representations. It enables computers to identify patterns and make decisions with minimal human intervention by learning from large amounts of data." > $LOG_PATH/dataprep_file.txt - validate_service \ - "http://${ip_address}:6007/v1/dataprep" \ - "Data preparation succeeded" \ - "dataprep_upload_file" \ - "dataprep-redis-server" - - # test /v1/dataprep upload link - validate_service \ - "http://${ip_address}:6007/v1/dataprep" \ - "Data preparation succeeded" \ - "dataprep_upload_link" \ - "dataprep-redis-server" - - # test /v1/dataprep/get_file - validate_service \ - "http://${ip_address}:6007/v1/dataprep/get_file" \ - '{"name":' \ - "dataprep_get" \ - "dataprep-redis-server" - - # test /v1/dataprep/delete_file - validate_service \ - "http://${ip_address}:6007/v1/dataprep/delete_file" \ - '{"status":true}' \ - "dataprep_del" \ - "dataprep-redis-server" - # retrieval microservice test_embedding=$(python3 -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)") validate_service \ "${ip_address}:7000/v1/retrieval" \ - "retrieved_docs" \ - "retrieval-microservice" \ + " " \ + "retrieval" \ "retriever-redis-server" \ "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${test_embedding}}" @@ -156,22 +108,21 @@ function validate_microservices() { "tei-reranking-gaudi-server" \ '{"query":"What is Deep Learning?", "texts": ["Deep Learning is not...", "Deep learning is..."]}' - # tgi for llm service + # vllm for llm service validate_service \ - "${ip_address}:8005/generate" \ - "generated_text" \ - "tgi-llm" \ - "tgi-gaudi-server" \ - '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' - + "${ip_address}:8007/v1/chat/completions" \ + "content" \ + "vllm-llm" \ + "vllm-gaudi-server" \ + '{"model": "Intel/neural-chat-7b-v3-3", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens":17}' } function validate_megaservice() 
{ # Curl the Mega Service validate_service \ "${ip_address}:8888/v1/chatqna" \ - "data: " \ - "chatqna-megaservice" \ + "data:" \ + "mega-chatqna" \ "chatqna-gaudi-backend-server" \ '{"messages": "What is the revenue of Nike in 2023?"}' @@ -207,7 +158,7 @@ function validate_frontend() { function stop_docker() { cd $WORKPATH/docker_compose/intel/hpu/gaudi - docker compose stop && docker compose rm -f + docker compose -f compose.yaml down } function main() { @@ -220,13 +171,9 @@ function main() { duration=$((end_time-start_time)) echo "Mega service start duration is $duration s" - if [ "${mode}" == "perf" ]; then - python3 $WORKPATH/tests/chatqna_benchmark.py - elif [ "${mode}" == "" ]; then - validate_microservices - validate_megaservice - validate_frontend - fi + validate_microservices + validate_megaservice + # validate_frontend stop_docker echo y | docker system prune diff --git a/ChatQnA/tests/test_compose_tgi_on_gaudi.sh b/ChatQnA/tests/test_compose_tgi_on_gaudi.sh new file mode 100644 index 0000000000..c6f496a19f --- /dev/null +++ b/ChatQnA/tests/test_compose_tgi_on_gaudi.sh @@ -0,0 +1,235 @@ +#!/bin/bash +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +set -e +IMAGE_REPO=${IMAGE_REPO:-"opea"} +IMAGE_TAG=${IMAGE_TAG:-"latest"} +echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}" +echo "TAG=IMAGE_TAG=${IMAGE_TAG}" +export REGISTRY=${IMAGE_REPO} +export TAG=${IMAGE_TAG} + +WORKPATH=$(dirname "$PWD") +LOG_PATH="$WORKPATH/tests" +ip_address=$(hostname -I | awk '{print $1}') + +function build_docker_images() { + cd $WORKPATH/docker_image_build + git clone https://github.com/opea-project/GenAIComps.git && cd GenAIComps && git checkout "${opea_branch:-"main"}" && cd ../ + + echo "Build all the images with --no-cache, check docker_image_build.log for details..." 
+ service_list="chatqna chatqna-ui dataprep-redis retriever nginx" + docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log + + docker pull ghcr.io/huggingface/tgi-gaudi:2.0.6 + docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 + docker pull ghcr.io/huggingface/tei-gaudi:1.5.0 + + docker images && sleep 1s +} + +function start_services() { + cd $WORKPATH/docker_compose/intel/hpu/gaudi + export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5" + export RERANK_MODEL_ID="BAAI/bge-reranker-base" + export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3" + export INDEX_NAME="rag-redis" + export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN} + export JAEGER_IP=$(ip route get 8.8.8.8 | grep -oP 'src \K[^ ]+') + export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=grpc://$JAEGER_IP:4317 + export TELEMETRY_ENDPOINT=http://$JAEGER_IP:4318/v1/traces + + # Start Docker Containers + sed -i "s|container_name: chatqna-gaudi-backend-server|container_name: chatqna-gaudi-backend-server\n volumes:\n - \"${WORKPATH}\/docker_image_build\/GenAIComps:\/home\/user\/GenAIComps\"|g" compose.yaml + docker compose -f compose_tgi.yaml up -d > ${LOG_PATH}/start_services_with_compose.log + + n=0 + until [[ "$n" -ge 500 ]]; do + docker logs tgi-gaudi-server > ${LOG_PATH}/tgi_service_start.log + if grep -q Connected ${LOG_PATH}/tgi_service_start.log; then + break + fi + sleep 1s + n=$((n+1)) + done +} + +function validate_service() { + local URL="$1" + local EXPECTED_RESULT="$2" + local SERVICE_NAME="$3" + local DOCKER_NAME="$4" + local INPUT_DATA="$5" + + if [[ $SERVICE_NAME == *"dataprep_upload_file"* ]]; then + cd $LOG_PATH + HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -F 'files=@./dataprep_file.txt' -H 'Content-Type: multipart/form-data' "$URL") + elif [[ $SERVICE_NAME == *"dataprep_upload_link"* ]]; then + HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -F 'link_list=["https://www.ces.tech/"]' "$URL") + elif [[ $SERVICE_NAME == *"dataprep_get"* ]]; then + HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -H 'Content-Type: application/json' "$URL") + elif [[ $SERVICE_NAME == *"dataprep_del"* ]]; then + HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -d '{"file_path": "all"}' -H 'Content-Type: application/json' "$URL") + else + HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL") + fi + HTTP_STATUS=$(echo $HTTP_RESPONSE | tr -d '\n' | sed -e 's/.*HTTPSTATUS://') + RESPONSE_BODY=$(echo $HTTP_RESPONSE | sed -e 's/HTTPSTATUS\:.*//g') + + docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log + + # check response status + if [ "$HTTP_STATUS" -ne "200" ]; then + echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS" + exit 1 + else + echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..." + fi + # check response body + if [[ "$RESPONSE_BODY" != *"$EXPECTED_RESULT"* ]]; then + echo "[ $SERVICE_NAME ] Content does not match the expected result: $RESPONSE_BODY" + exit 1 + else + echo "[ $SERVICE_NAME ] Content is as expected." + fi + + sleep 1s +} + +function validate_microservices() { + # Check if the microservices are running correctly. 
+ + # tei for embedding service + validate_service \ + "${ip_address}:8090/embed" \ + "[[" \ + "tei-embedding" \ + "tei-embedding-gaudi-server" \ + '{"inputs":"What is Deep Learning?"}' + + sleep 1m # retrieval can't curl as expected, try to wait for more time + + # test /v1/dataprep upload file + echo "Deep learning is a subset of machine learning that utilizes neural networks with multiple layers to analyze various levels of abstract data representations. It enables computers to identify patterns and make decisions with minimal human intervention by learning from large amounts of data." > $LOG_PATH/dataprep_file.txt + validate_service \ + "http://${ip_address}:6007/v1/dataprep" \ + "Data preparation succeeded" \ + "dataprep_upload_file" \ + "dataprep-redis-server" + + # test /v1/dataprep upload link + validate_service \ + "http://${ip_address}:6007/v1/dataprep" \ + "Data preparation succeeded" \ + "dataprep_upload_link" \ + "dataprep-redis-server" + + # test /v1/dataprep/get_file + validate_service \ + "http://${ip_address}:6007/v1/dataprep/get_file" \ + '{"name":' \ + "dataprep_get" \ + "dataprep-redis-server" + + # test /v1/dataprep/delete_file + validate_service \ + "http://${ip_address}:6007/v1/dataprep/delete_file" \ + '{"status":true}' \ + "dataprep_del" \ + "dataprep-redis-server" + + # retrieval microservice + test_embedding=$(python3 -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)") + validate_service \ + "${ip_address}:7000/v1/retrieval" \ + "retrieved_docs" \ + "retrieval-microservice" \ + "retriever-redis-server" \ + "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${test_embedding}}" + + # tei for rerank microservice + validate_service \ + "${ip_address}:8808/rerank" \ + '{"index":1,"score":' \ + "tei-rerank" \ + "tei-reranking-gaudi-server" \ + '{"query":"What is Deep Learning?", "texts": ["Deep Learning is not...", "Deep learning is..."]}' + + # tgi for llm service + validate_service \ + "${ip_address}:8005/v1/chat/completions" \ + "content" \ + "tgi-llm" \ + "tgi-gaudi-server" \ + '{"model": "Intel/neural-chat-7b-v3-3", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens":17}' +} + +function validate_megaservice() { + # Curl the Mega Service + validate_service \ + "${ip_address}:8888/v1/chatqna" \ + "data: " \ + "chatqna-megaservice" \ + "chatqna-gaudi-backend-server" \ + '{"messages": "What is the revenue of Nike in 2023?"}' + +} + +function validate_frontend() { + cd $WORKPATH/ui/svelte + local conda_env_name="OPEA_e2e" + export PATH=${HOME}/miniforge3/bin/:$PATH + if conda info --envs | grep -q "$conda_env_name"; then + echo "$conda_env_name exist!" + else + conda create -n ${conda_env_name} python=3.12 -y + fi + source activate ${conda_env_name} + + sed -i "s/localhost/$ip_address/g" playwright.config.ts + + conda install -c conda-forge nodejs=22.6.0 -y + npm install && npm ci && npx playwright install --with-deps + node -v && npm -v && pip list + + exit_status=0 + npx playwright test || exit_status=$? 
+ + if [ $exit_status -ne 0 ]; then + echo "[TEST INFO]: ---------frontend test failed---------" + exit $exit_status + else + echo "[TEST INFO]: ---------frontend test passed---------" + fi +} + +function stop_docker() { + cd $WORKPATH/docker_compose/intel/hpu/gaudi + docker compose -f compose_tgi.yaml down +} + +function main() { + + stop_docker + if [[ "$IMAGE_REPO" == "opea" ]]; then build_docker_images; fi + start_time=$(date +%s) + start_services + end_time=$(date +%s) + duration=$((end_time-start_time)) + echo "Mega service start duration is $duration s" + + if [ "${mode}" == "perf" ]; then + python3 $WORKPATH/tests/chatqna_benchmark.py + elif [ "${mode}" == "" ]; then + validate_microservices + validate_megaservice + validate_frontend + fi + + stop_docker + echo y | docker system prune + +} + +main diff --git a/ChatQnA/tests/test_compose_vllm_on_gaudi.sh b/ChatQnA/tests/test_compose_vllm_on_gaudi.sh deleted file mode 100644 index 75af30e149..0000000000 --- a/ChatQnA/tests/test_compose_vllm_on_gaudi.sh +++ /dev/null @@ -1,183 +0,0 @@ -#!/bin/bash -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -set -e -IMAGE_REPO=${IMAGE_REPO:-"opea"} -IMAGE_TAG=${IMAGE_TAG:-"latest"} -echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}" -echo "TAG=IMAGE_TAG=${IMAGE_TAG}" -export REGISTRY=${IMAGE_REPO} -export TAG=${IMAGE_TAG} - -WORKPATH=$(dirname "$PWD") -LOG_PATH="$WORKPATH/tests" -ip_address=$(hostname -I | awk '{print $1}') - -function build_docker_images() { - cd $WORKPATH/docker_image_build - git clone https://github.com/opea-project/GenAIComps.git && cd GenAIComps && git checkout "${opea_branch:-"main"}" && cd ../ - git clone https://github.com/HabanaAI/vllm-fork.git && cd vllm-fork && git checkout v0.6.4.post2+Gaudi-1.19.0 && cd ../ - - echo "Build all the images with --no-cache, check docker_image_build.log for details..." - service_list="chatqna chatqna-ui dataprep-redis retriever vllm-gaudi nginx" - docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log - - docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 - docker pull ghcr.io/huggingface/tei-gaudi:1.5.0 - docker images && sleep 1s -} - -function start_services() { - cd $WORKPATH/docker_compose/intel/hpu/gaudi - export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5" - export RERANK_MODEL_ID="BAAI/bge-reranker-base" - export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3" - export INDEX_NAME="rag-redis" - export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN} - - # Start Docker Containers - docker compose -f compose_vllm.yaml up -d > ${LOG_PATH}/start_services_with_compose.log - n=0 - until [[ "$n" -ge 160 ]]; do - echo "n=$n" - docker logs vllm-gaudi-server > vllm_service_start.log - if grep -q "Warmup finished" vllm_service_start.log; then - break - fi - sleep 5s - n=$((n+1)) - done -} - -function validate_services() { - local URL="$1" - local EXPECTED_RESULT="$2" - local SERVICE_NAME="$3" - local DOCKER_NAME="$4" - local INPUT_DATA="$5" - - local HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL") - if [ "$HTTP_STATUS" -eq 200 ]; then - echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..." - - local CONTENT=$(curl -s -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL" | tee ${LOG_PATH}/${SERVICE_NAME}.log) - - if echo "$CONTENT" | grep -q "$EXPECTED_RESULT"; then - echo "[ $SERVICE_NAME ] Content is as expected." 
- else - echo "[ $SERVICE_NAME ] Content does not match the expected result: $CONTENT" - docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log - exit 1 - fi - else - echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS" - docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log - exit 1 - fi - sleep 1s -} - -function validate_microservices() { - # Check if the microservices are running correctly. - - # tei for embedding service - validate_services \ - "${ip_address}:8090/embed" \ - "\[\[" \ - "tei-embedding" \ - "tei-embedding-gaudi-server" \ - '{"inputs":"What is Deep Learning?"}' - - sleep 1m # retrieval can't curl as expected, try to wait for more time - - # retrieval microservice - test_embedding=$(python3 -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)") - validate_services \ - "${ip_address}:7000/v1/retrieval" \ - " " \ - "retrieval" \ - "retriever-redis-server" \ - "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${test_embedding}}" - - # tei for rerank microservice - validate_services \ - "${ip_address}:8808/rerank" \ - '{"index":1,"score":' \ - "tei-rerank" \ - "tei-reranking-gaudi-server" \ - '{"query":"What is Deep Learning?", "texts": ["Deep Learning is not...", "Deep learning is..."]}' - - # vllm for llm service - validate_services \ - "${ip_address}:8007/v1/completions" \ - "text" \ - "vllm-llm" \ - "vllm-gaudi-server" \ - '{"model": "Intel/neural-chat-7b-v3-3","prompt": "What is Deep Learning?","max_tokens": 32,"temperature": 0}' -} - -function validate_megaservice() { - # Curl the Mega Service - validate_services \ - "${ip_address}:8888/v1/chatqna" \ - "data:" \ - "mega-chatqna" \ - "chatqna-gaudi-backend-server" \ - '{"messages": "What is the revenue of Nike in 2023?"}' - -} - -function validate_frontend() { - cd $WORKPATH/ui/svelte - local conda_env_name="OPEA_e2e" - export PATH=${HOME}/miniforge3/bin/:$PATH - if conda info --envs | grep -q "$conda_env_name"; then - echo "$conda_env_name exist!" - else - conda create -n ${conda_env_name} python=3.12 -y - fi - source activate ${conda_env_name} - - sed -i "s/localhost/$ip_address/g" playwright.config.ts - - conda install -c conda-forge nodejs=22.6.0 -y - npm install && npm ci && npx playwright install --with-deps - node -v && npm -v && pip list - - exit_status=0 - npx playwright test || exit_status=$? 
- - if [ $exit_status -ne 0 ]; then - echo "[TEST INFO]: ---------frontend test failed---------" - exit $exit_status - else - echo "[TEST INFO]: ---------frontend test passed---------" - fi -} - -function stop_docker() { - cd $WORKPATH/docker_compose/intel/hpu/gaudi - docker compose -f compose_vllm.yaml down -} - -function main() { - - stop_docker - if [[ "$IMAGE_REPO" == "opea" ]]; then build_docker_images; fi - start_time=$(date +%s) - start_services - end_time=$(date +%s) - duration=$((end_time-start_time)) - echo "Mega service start duration is $duration s" - - validate_microservices - validate_megaservice - # validate_frontend - - stop_docker - echo y | docker system prune - -} - -main diff --git a/ChatQnA/tests/test_compose_without_rerank_on_gaudi.sh b/ChatQnA/tests/test_compose_without_rerank_on_gaudi.sh index e1187bfcf9..34e1af18e0 100644 --- a/ChatQnA/tests/test_compose_without_rerank_on_gaudi.sh +++ b/ChatQnA/tests/test_compose_without_rerank_on_gaudi.sh @@ -17,12 +17,12 @@ ip_address=$(hostname -I | awk '{print $1}') function build_docker_images() { cd $WORKPATH/docker_image_build git clone https://github.com/opea-project/GenAIComps.git && cd GenAIComps && git checkout "${opea_branch:-"main"}" && cd ../ + git clone https://github.com/HabanaAI/vllm-fork.git && cd vllm-fork && git checkout v0.6.4.post2+Gaudi-1.19.0 && cd ../ echo "Build all the images with --no-cache, check docker_image_build.log for details..." - service_list="chatqna-without-rerank chatqna-ui dataprep-redis retriever nginx" + service_list="chatqna-without-rerank chatqna-ui dataprep-redis retriever vllm-gaudi nginx" docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log - docker pull ghcr.io/huggingface/tgi-gaudi:2.0.6 docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 docker pull ghcr.io/huggingface/tei-gaudi:1.5.0 @@ -40,9 +40,9 @@ function start_services() { docker compose -f compose_without_rerank.yaml up -d > ${LOG_PATH}/start_services_with_compose.log n=0 - until [[ "$n" -ge 100 ]]; do - docker logs tgi-gaudi-server > ${LOG_PATH}/tgi_service_start.log - if grep -q Connected ${LOG_PATH}/tgi_service_start.log; then + until [[ "$n" -ge 160 ]]; do + docker logs vllm-gaudi-server > vllm_service_start.log + if grep -q "Warmup finished" vllm_service_start.log; then break fi sleep 5s @@ -143,13 +143,13 @@ function validate_microservices() { "retriever-redis-server" \ "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${test_embedding}}" - # tgi for llm service + # vllm for llm service validate_service \ - "${ip_address}:8005/generate" \ - "generated_text" \ - "tgi-llm" \ - "tgi-gaudi-server" \ - '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' + "${ip_address}:8007/v1/chat/completions" \ + "content" \ + "vllm-llm" \ + "vllm-gaudi-server" \ + '{"model": "Intel/neural-chat-7b-v3-3", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens":17}' } function validate_megaservice() { @@ -193,7 +193,7 @@ function validate_frontend() { function stop_docker() { cd $WORKPATH/docker_compose/intel/hpu/gaudi - docker compose -f compose_without_rerank.yaml stop && docker compose -f compose_without_rerank.yaml rm -f + docker compose -f compose_without_rerank.yaml down } function main() { From 742cb6ddd368db120274f2e7bd0847a89af670b9 Mon Sep 17 00:00:00 2001 From: "Wang, Kai Lawrence" <109344418+wangkl2@users.noreply.github.com> Date: Fri, 17 Jan 2025 20:48:19 +0800 Subject: [PATCH 3/3] 
[ChatQnA] Switch to vLLM as default llm backend on Xeon (#1403) Switching from TGI to vLLM as the default LLM serving backend on Xeon for the ChatQnA example to enhance the perf. https://github.com/opea-project/GenAIExamples/issues/1213 Signed-off-by: Wang, Kai Lawrence --- .../docker_compose/intel/cpu/xeon/README.md | 46 ++++--- .../intel/cpu/xeon/README_pinecone.md | 21 +-- .../intel/cpu/xeon/README_qdrant.md | 22 +-- .../intel/cpu/xeon/compose.yaml | 19 ++- .../intel/cpu/xeon/compose_pinecone.yaml | 18 +-- .../intel/cpu/xeon/compose_qdrant.yaml | 20 +-- .../{compose_vllm.yaml => compose_tgi.yaml} | 19 +-- .../cpu/xeon/compose_without_rerank.yaml | 18 +-- ChatQnA/tests/test_compose_on_xeon.sh | 118 +++++----------- .../tests/test_compose_pinecone_on_xeon.sh | 27 ++-- ChatQnA/tests/test_compose_qdrant_on_xeon.sh | 37 ++--- ...on_xeon.sh => test_compose_tgi_on_xeon.sh} | 126 ++++++++++++------ .../test_compose_without_rerank_on_xeon.sh | 22 +-- 13 files changed, 259 insertions(+), 254 deletions(-) rename ChatQnA/docker_compose/intel/cpu/xeon/{compose_vllm.yaml => compose_tgi.yaml} (92%) rename ChatQnA/tests/{test_compose_vllm_on_xeon.sh => test_compose_tgi_on_xeon.sh} (50%) diff --git a/ChatQnA/docker_compose/intel/cpu/xeon/README.md b/ChatQnA/docker_compose/intel/cpu/xeon/README.md index f908470d71..750cca5887 100644 --- a/ChatQnA/docker_compose/intel/cpu/xeon/README.md +++ b/ChatQnA/docker_compose/intel/cpu/xeon/README.md @@ -1,6 +1,8 @@ # Build Mega Service of ChatQnA on Xeon -This document outlines the deployment process for a ChatQnA application utilizing the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline on Intel Xeon server. The steps include Docker image creation, container deployment via Docker Compose, and service execution to integrate microservices such as `embedding`, `retriever`, `rerank`, and `llm`. We will publish the Docker images to Docker Hub soon, it will simplify the deployment process for this service. +This document outlines the deployment process for a ChatQnA application utilizing the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline on Intel Xeon server. The steps include Docker image creation, container deployment via Docker Compose, and service execution to integrate microservices such as `embedding`, `retriever`, `rerank`, and `llm`. + +The default pipeline deploys with vLLM as the LLM serving component and leverages rerank component. It also provides options of not using rerank in the pipeline and using TGI backend for LLM microservice, please refer to [start-all-the-services-docker-containers](#start-all-the-services-docker-containers) section in this page. Besides, refer to [Build with Pinecone VectorDB](./README_pinecone.md) and [Build with Qdrant VectorDB](./README_qdrant.md) for other deployment variants. Quick Start: @@ -186,7 +188,7 @@ By default, the embedding, reranking and LLM models are set to a default value a Change the `xxx_MODEL_ID` below for your needs. -For users in China who are unable to download models directly from Huggingface, you can use [ModelScope](https://www.modelscope.cn/models) or a Huggingface mirror to download models. TGI can load the models either online or offline as described below: +For users in China who are unable to download models directly from Huggingface, you can use [ModelScope](https://www.modelscope.cn/models) or a Huggingface mirror to download models. The vLLM/TGI can load the models either online or offline as described below: 1. 
Online @@ -194,6 +196,9 @@ For users in China who are unable to download models directly from Huggingface, export HF_TOKEN=${your_hf_token} export HF_ENDPOINT="https://hf-mirror.com" model_name="Intel/neural-chat-7b-v3-3" + # Start vLLM LLM Service + docker run -p 8008:80 -v ./data:/data --name vllm-service -e HF_ENDPOINT=$HF_ENDPOINT -e http_proxy=$http_proxy -e https_proxy=$https_proxy --shm-size 128g opea/vllm:latest --model $model_name --host 0.0.0.0 --port 80 + # Start TGI LLM Service docker run -p 8008:80 -v ./data:/data --name tgi-service -e HF_ENDPOINT=$HF_ENDPOINT -e http_proxy=$http_proxy -e https_proxy=$https_proxy --shm-size 1g ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu --model-id $model_name ``` @@ -203,12 +208,15 @@ For users in China who are unable to download models directly from Huggingface, - Click on `Download this model` button, and choose one way to download the model to your local path `/path/to/model`. - - Run the following command to start TGI service. + - Run the following command to start the LLM service. ```bash export HF_TOKEN=${your_hf_token} export model_path="/path/to/model" - docker run -p 8008:80 -v $model_path:/data --name tgi_service --shm-size 1g ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu --model-id /data + # Start vLLM LLM Service + docker run -p 8008:80 -v $model_path:/data --name vllm-service --shm-size 128g opea/vllm:latest --model /data --host 0.0.0.0 --port 80 + # Start TGI LLM Service + docker run -p 8008:80 -v $model_path:/data --name tgi-service --shm-size 1g ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu --model-id /data ``` ### Setup Environment Variables @@ -246,7 +254,7 @@ For users in China who are unable to download models directly from Huggingface, cd GenAIExamples/ChatQnA/docker_compose/intel/cpu/xeon/ ``` -If use TGI backend. +If use vLLM as the LLM serving backend. ```bash # Start ChatQnA with Rerank Pipeline @@ -255,10 +263,10 @@ docker compose -f compose.yaml up -d docker compose -f compose_without_rerank.yaml up -d ``` -If use vLLM backend. +If use TGI as the LLM serving backend. ```bash -docker compose -f compose_vllm.yaml up -d +docker compose -f compose_tgi.yaml up -d ``` ### Validate Microservices @@ -305,37 +313,34 @@ For details on how to verify the correctness of the response, refer to [how-to-v 4. LLM backend Service - In first startup, this service will take more time to download the model files. After it's finished, the service will be ready. + In the first startup, this service will take more time to download, load and warm up the model. After it's finished, the service will be ready. Try the command below to check whether the LLM serving is ready. ```bash - docker logs tgi-service | grep Connected + # vLLM service + docker logs vllm-service 2>&1 | grep complete + # If the service is ready, you will get the response like below. + INFO: Application startup complete. ``` - If the service is ready, you will get the response like below. - - ``` + ```bash + # TGI service + docker logs tgi-service | grep Connected + # If the service is ready, you will get the response like below. 2024-09-03T02:47:53.402023Z INFO text_generation_router::server: router/src/server.rs:2311: Connected ``` Then try the `cURL` command below to validate services. 
```bash - # TGI service + # either vLLM or TGI service curl http://${host_ip}:9009/v1/chat/completions \ -X POST \ -d '{"model": "Intel/neural-chat-7b-v3-3", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens":17}' \ -H 'Content-Type: application/json' ``` - ```bash - # vLLM Service - curl http://${host_ip}:9009/v1/chat/completions \ - -H "Content-Type: application/json" \ - -d '{"model": "Intel/neural-chat-7b-v3-3", "messages": [{"role": "user", "content": "What is Deep Learning?"}]}' - ``` - 5. MegaService ```bash @@ -362,7 +367,6 @@ Or run this command to get the file on a terminal. ```bash wget https://raw.githubusercontent.com/opea-project/GenAIComps/v1.1/comps/retrievers/redis/data/nke-10k-2023.pdf - ``` Upload: diff --git a/ChatQnA/docker_compose/intel/cpu/xeon/README_pinecone.md b/ChatQnA/docker_compose/intel/cpu/xeon/README_pinecone.md index dce5b0a540..c87a0a81cf 100644 --- a/ChatQnA/docker_compose/intel/cpu/xeon/README_pinecone.md +++ b/ChatQnA/docker_compose/intel/cpu/xeon/README_pinecone.md @@ -1,6 +1,8 @@ # Build Mega Service of ChatQnA on Xeon -This document outlines the deployment process for a ChatQnA application utilizing the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline on Intel Xeon server. The steps include Docker image creation, container deployment via Docker Compose, and service execution to integrate microservices such as `embedding`, `retriever`, `rerank`, and `llm`. We will publish the Docker images to Docker Hub soon, it will simplify the deployment process for this service. +This document outlines the deployment process for a ChatQnA application utilizing the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline on Intel Xeon server. The steps include Docker image creation, container deployment via Docker Compose, and service execution to integrate microservices such as `embedding`, `retriever`, `rerank`, and `llm`. + +The default pipeline deploys with vLLM as the LLM serving component and leverages rerank component. Quick Start: @@ -189,7 +191,7 @@ By default, the embedding, reranking and LLM models are set to a default value a Change the `xxx_MODEL_ID` below for your needs. -For users in China who are unable to download models directly from Huggingface, you can use [ModelScope](https://www.modelscope.cn/models) or a Huggingface mirror to download models. TGI can load the models either online or offline as described below: +For users in China who are unable to download models directly from Huggingface, you can use [ModelScope](https://www.modelscope.cn/models) or a Huggingface mirror to download models. The vLLM can load the models either online or offline as described below: 1. Online @@ -197,7 +199,7 @@ For users in China who are unable to download models directly from Huggingface, export HF_TOKEN=${your_hf_token} export HF_ENDPOINT="https://hf-mirror.com" model_name="Intel/neural-chat-7b-v3-3" - docker run -p 8008:80 -v ./data:/data --name tgi-service -e HF_ENDPOINT=$HF_ENDPOINT -e http_proxy=$http_proxy -e https_proxy=$https_proxy --shm-size 1g ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu --model-id $model_name + docker run -p 8008:80 -v ./data:/data --name vllm-service -e HF_ENDPOINT=$HF_ENDPOINT -e http_proxy=$http_proxy -e https_proxy=$https_proxy --shm-size 128g opea/vllm:latest --model $model_name --host 0.0.0.0 --port 80 ``` 2. 
Offline @@ -206,12 +208,12 @@ For users in China who are unable to download models directly from Huggingface, - Click on `Download this model` button, and choose one way to download the model to your local path `/path/to/model`. - - Run the following command to start TGI service. + - Run the following command to start the LLM service. ```bash export HF_TOKEN=${your_hf_token} export model_path="/path/to/model" - docker run -p 8008:80 -v $model_path:/data --name tgi_service --shm-size 1g ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu --model-id /data + docker run -p 8008:80 -v $model_path:/data --name vllm-service --shm-size 128g opea/vllm:latest --model /data --host 0.0.0.0 --port 80 ``` ### Setup Environment Variables @@ -252,7 +254,7 @@ For users in China who are unable to download models directly from Huggingface, cd GenAIExamples/ChatQnA/docker_compose/intel/cpu/xeon/ ``` -If use TGI backend. +If use vLLM backend. ```bash # Start ChatQnA with Rerank Pipeline @@ -303,24 +305,23 @@ For details on how to verify the correctness of the response, refer to [how-to-v 4. LLM backend Service - In first startup, this service will take more time to download the model files. After it's finished, the service will be ready. + In the first startup, this service will take more time to download, load and warm up the model. After it's finished, the service will be ready. Try the command below to check whether the LLM serving is ready. ```bash - docker logs tgi-service | grep Connected + docker logs vllm-service 2>&1 | grep complete ``` If the service is ready, you will get the response like below. ```text - 2024-09-03T02:47:53.402023Z INFO text_generation_router::server: router/src/server.rs:2311: Connected + INFO: Application startup complete. ``` Then try the `cURL` command below to validate services. ```bash - # TGI service curl http://${host_ip}:9009/v1/chat/completions \ -X POST \ -d '{"model": "Intel/neural-chat-7b-v3-3", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens":17}' \ diff --git a/ChatQnA/docker_compose/intel/cpu/xeon/README_qdrant.md b/ChatQnA/docker_compose/intel/cpu/xeon/README_qdrant.md index 6688f25370..a77ebf0f7d 100644 --- a/ChatQnA/docker_compose/intel/cpu/xeon/README_qdrant.md +++ b/ChatQnA/docker_compose/intel/cpu/xeon/README_qdrant.md @@ -1,6 +1,8 @@ # Build Mega Service of ChatQnA (with Qdrant) on Xeon -This document outlines the deployment process for a ChatQnA application utilizing the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline on Intel Xeon server. The steps include Docker image creation, container deployment via Docker Compose, and service execution to integrate microservices such as `embedding`, `retriever`, `rerank`, and `llm`. We will publish the Docker images to Docker Hub soon, it will simplify the deployment process for this service. +This document outlines the deployment process for a ChatQnA application utilizing the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline on Intel Xeon server. The steps include Docker image creation, container deployment via Docker Compose, and service execution to integrate microservices such as `embedding`, `retriever`, `rerank`, and `llm`. + +The default pipeline deploys with vLLM as the LLM serving component and leverages rerank component. 
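As a quick sanity check of this default vLLM-backed pipeline, the sketch below brings up the Qdrant variant and probes the LLM backend before exercising the full ChatQnA flow. It is a minimal example, assuming the services are started from this directory with `compose_qdrant.yaml` and that the environment variables from the setup steps (e.g. `host_ip`, `HUGGINGFACEHUB_API_TOKEN`) are already exported; the readiness check and endpoint match the validation steps described later in this page.

```bash
# Bring up the default (vLLM-backed) Qdrant pipeline from this directory
docker compose -f compose_qdrant.yaml up -d

# Wait until vLLM has finished loading and warming up the model
# (prints "Application startup complete." when ready)
docker logs vllm-service 2>&1 | grep complete

# Smoke-test the OpenAI-compatible chat completions endpoint exposed on port 6042
curl http://${host_ip}:6042/v1/chat/completions \
  -X POST \
  -H 'Content-Type: application/json' \
  -d '{"model": "Intel/neural-chat-7b-v3-3", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens": 17}'
```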
## 🚀 Apply Xeon Server on AWS @@ -44,7 +46,7 @@ reranking ========= Port 6046 - Open to 0.0.0.0/0 -tgi-service +vllm-service =========== Port 6042 - Open to 0.0.0.0/0 @@ -170,7 +172,7 @@ export your_hf_api_token="Your_Huggingface_API_Token" **Append the value of the public IP address to the no_proxy list if you are in a proxy environment** ``` -export your_no_proxy=${your_no_proxy},"External_Public_IP",chatqna-xeon-ui-server,chatqna-xeon-backend-server,dataprep-qdrant-service,tei-embedding-service,retriever,tei-reranking-service,tgi-service +export your_no_proxy=${your_no_proxy},"External_Public_IP",chatqna-xeon-ui-server,chatqna-xeon-backend-server,dataprep-qdrant-service,tei-embedding-service,retriever,tei-reranking-service,tgi-service,vllm-service ``` ```bash @@ -233,23 +235,23 @@ For details on how to verify the correctness of the response, refer to [how-to-v -H 'Content-Type: application/json' ``` -4. TGI Service +4. LLM Backend Service - In first startup, this service will take more time to download the model files. After it's finished, the service will be ready. + In the first startup, this service will take more time to download, load and warm up the model. After it's finished, the service will be ready. - Try the command below to check whether the TGI service is ready. + Try the command below to check whether the LLM service is ready. ```bash - docker logs ${CONTAINER_ID} | grep Connected + docker logs vllm-service 2>&1 | grep complete ``` If the service is ready, you will get the response like below. - ``` - 2024-09-03T02:47:53.402023Z INFO text_generation_router::server: router/src/server.rs:2311: Connected + ```text + INFO: Application startup complete. ``` - Then try the `cURL` command below to validate TGI. + Then try the `cURL` command below to validate vLLM service. 
```bash curl http://${host_ip}:6042/v1/chat/completions \ diff --git a/ChatQnA/docker_compose/intel/cpu/xeon/compose.yaml b/ChatQnA/docker_compose/intel/cpu/xeon/compose.yaml index 6e94a9f998..f34868b6de 100644 --- a/ChatQnA/docker_compose/intel/cpu/xeon/compose.yaml +++ b/ChatQnA/docker_compose/intel/cpu/xeon/compose.yaml @@ -74,32 +74,31 @@ services: HF_HUB_DISABLE_PROGRESS_BARS: 1 HF_HUB_ENABLE_HF_TRANSFER: 0 command: --model-id ${RERANK_MODEL_ID} --auto-truncate - tgi-service: - image: ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu - container_name: tgi-service + vllm-service: + image: ${REGISTRY:-opea}/vllm:${TAG:-latest} + container_name: vllm-service ports: - "9009:80" volumes: - "./data:/data" - shm_size: 1g + shm_size: 128g environment: no_proxy: ${no_proxy} http_proxy: ${http_proxy} https_proxy: ${https_proxy} HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} - HF_HUB_DISABLE_PROGRESS_BARS: 1 - HF_HUB_ENABLE_HF_TRANSFER: 0 - command: --model-id ${LLM_MODEL_ID} --cuda-graphs 0 + LLM_MODEL_ID: ${LLM_MODEL_ID} + VLLM_TORCH_PROFILER_DIR: "/mnt" + command: --model $LLM_MODEL_ID --host 0.0.0.0 --port 80 chatqna-xeon-backend-server: image: ${REGISTRY:-opea}/chatqna:${TAG:-latest} container_name: chatqna-xeon-backend-server depends_on: - redis-vector-db - tei-embedding-service - - dataprep-redis-service - retriever - tei-reranking-service - - tgi-service + - vllm-service ports: - "8888:8888" environment: @@ -112,7 +111,7 @@ services: - RETRIEVER_SERVICE_HOST_IP=retriever - RERANK_SERVER_HOST_IP=tei-reranking-service - RERANK_SERVER_PORT=${RERANK_SERVER_PORT:-80} - - LLM_SERVER_HOST_IP=tgi-service + - LLM_SERVER_HOST_IP=vllm-service - LLM_SERVER_PORT=${LLM_SERVER_PORT:-80} - LLM_MODEL=${LLM_MODEL_ID} - LOGFLAG=${LOGFLAG} diff --git a/ChatQnA/docker_compose/intel/cpu/xeon/compose_pinecone.yaml b/ChatQnA/docker_compose/intel/cpu/xeon/compose_pinecone.yaml index 022fe3b612..5378b581ef 100644 --- a/ChatQnA/docker_compose/intel/cpu/xeon/compose_pinecone.yaml +++ b/ChatQnA/docker_compose/intel/cpu/xeon/compose_pinecone.yaml @@ -68,22 +68,22 @@ services: HF_HUB_DISABLE_PROGRESS_BARS: 1 HF_HUB_ENABLE_HF_TRANSFER: 0 command: --model-id ${RERANK_MODEL_ID} --auto-truncate - tgi-service: - image: ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu - container_name: tgi-service + vllm-service: + image: ${REGISTRY:-opea}/vllm:${TAG:-latest} + container_name: vllm-service ports: - "9009:80" volumes: - "./data:/data" - shm_size: 1g + shm_size: 128g environment: no_proxy: ${no_proxy} http_proxy: ${http_proxy} https_proxy: ${https_proxy} HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} - HF_HUB_DISABLE_PROGRESS_BARS: 1 - HF_HUB_ENABLE_HF_TRANSFER: 0 - command: --model-id ${LLM_MODEL_ID} --cuda-graphs 0 + LLM_MODEL_ID: ${LLM_MODEL_ID} + VLLM_TORCH_PROFILER_DIR: "/mnt" + command: --model $LLM_MODEL_ID --host 0.0.0.0 --port 80 chatqna-xeon-backend-server: image: ${REGISTRY:-opea}/chatqna:${TAG:-latest} container_name: chatqna-xeon-backend-server @@ -92,7 +92,7 @@ services: - dataprep-pinecone-service - retriever - tei-reranking-service - - tgi-service + - vllm-service ports: - "8888:8888" environment: @@ -105,7 +105,7 @@ services: - RETRIEVER_SERVICE_HOST_IP=retriever - RERANK_SERVER_HOST_IP=tei-reranking-service - RERANK_SERVER_PORT=${RERANK_SERVER_PORT:-80} - - LLM_SERVER_HOST_IP=tgi-service + - LLM_SERVER_HOST_IP=vllm-service - LLM_SERVER_PORT=${LLM_SERVER_PORT:-80} - LOGFLAG=${LOGFLAG} - LLM_MODEL=${LLM_MODEL_ID} diff --git a/ChatQnA/docker_compose/intel/cpu/xeon/compose_qdrant.yaml 
b/ChatQnA/docker_compose/intel/cpu/xeon/compose_qdrant.yaml index af69531c21..c3a2d00dc8 100644 --- a/ChatQnA/docker_compose/intel/cpu/xeon/compose_qdrant.yaml +++ b/ChatQnA/docker_compose/intel/cpu/xeon/compose_qdrant.yaml @@ -53,7 +53,7 @@ services: QDRANT_HOST: qdrant-vector-db QDRANT_PORT: 6333 INDEX_NAME: ${INDEX_NAME} - TEI_EMBEDDING_ENDPOINT: ${TEI_EMBEDDING_ENDPOINT} + TEI_EMBEDDING_ENDPOINT: http://tei-embedding-service:80 LOGFLAG: ${LOGFLAG} RETRIEVER_COMPONENT_NAME: "OPEA_RETRIEVER_QDRANT" restart: unless-stopped @@ -73,22 +73,22 @@ services: HF_HUB_DISABLE_PROGRESS_BARS: 1 HF_HUB_ENABLE_HF_TRANSFER: 0 command: --model-id ${RERANK_MODEL_ID} --auto-truncate - tgi-service: - image: ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu - container_name: tgi-service + vllm-service: + image: ${REGISTRY:-opea}/vllm:${TAG:-latest} + container_name: vllm-service ports: - "6042:80" volumes: - "./data:/data" - shm_size: 1g + shm_size: 128g environment: no_proxy: ${no_proxy} http_proxy: ${http_proxy} https_proxy: ${https_proxy} HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} - HF_HUB_DISABLE_PROGRESS_BARS: 1 - HF_HUB_ENABLE_HF_TRANSFER: 0 - command: --model-id ${LLM_MODEL_ID} --cuda-graphs 0 + LLM_MODEL_ID: ${LLM_MODEL_ID} + VLLM_TORCH_PROFILER_DIR: "/mnt" + command: --model $LLM_MODEL_ID --host 0.0.0.0 --port 80 chatqna-xeon-backend-server: image: ${REGISTRY:-opea}/chatqna:${TAG:-latest} container_name: chatqna-xeon-backend-server @@ -97,7 +97,7 @@ services: - tei-embedding-service - retriever - tei-reranking-service - - tgi-service + - vllm-service ports: - "8912:8888" environment: @@ -111,7 +111,7 @@ services: - RETRIEVER_SERVICE_PORT=${RETRIEVER_SERVICE_PORT:-7000} - RERANK_SERVER_HOST_IP=tei-reranking-service - RERANK_SERVER_PORT=${RERANK_SERVER_PORT:-80} - - LLM_SERVER_HOST_IP=tgi-service + - LLM_SERVER_HOST_IP=vllm-service - LLM_SERVER_PORT=${LLM_SERVER_PORT:-80} - LLM_MODEL=${LLM_MODEL_ID} - LOGFLAG=${LOGFLAG} diff --git a/ChatQnA/docker_compose/intel/cpu/xeon/compose_vllm.yaml b/ChatQnA/docker_compose/intel/cpu/xeon/compose_tgi.yaml similarity index 92% rename from ChatQnA/docker_compose/intel/cpu/xeon/compose_vllm.yaml rename to ChatQnA/docker_compose/intel/cpu/xeon/compose_tgi.yaml index f34868b6de..6e94a9f998 100644 --- a/ChatQnA/docker_compose/intel/cpu/xeon/compose_vllm.yaml +++ b/ChatQnA/docker_compose/intel/cpu/xeon/compose_tgi.yaml @@ -74,31 +74,32 @@ services: HF_HUB_DISABLE_PROGRESS_BARS: 1 HF_HUB_ENABLE_HF_TRANSFER: 0 command: --model-id ${RERANK_MODEL_ID} --auto-truncate - vllm-service: - image: ${REGISTRY:-opea}/vllm:${TAG:-latest} - container_name: vllm-service + tgi-service: + image: ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu + container_name: tgi-service ports: - "9009:80" volumes: - "./data:/data" - shm_size: 128g + shm_size: 1g environment: no_proxy: ${no_proxy} http_proxy: ${http_proxy} https_proxy: ${https_proxy} HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} - LLM_MODEL_ID: ${LLM_MODEL_ID} - VLLM_TORCH_PROFILER_DIR: "/mnt" - command: --model $LLM_MODEL_ID --host 0.0.0.0 --port 80 + HF_HUB_DISABLE_PROGRESS_BARS: 1 + HF_HUB_ENABLE_HF_TRANSFER: 0 + command: --model-id ${LLM_MODEL_ID} --cuda-graphs 0 chatqna-xeon-backend-server: image: ${REGISTRY:-opea}/chatqna:${TAG:-latest} container_name: chatqna-xeon-backend-server depends_on: - redis-vector-db - tei-embedding-service + - dataprep-redis-service - retriever - tei-reranking-service - - vllm-service + - tgi-service ports: - "8888:8888" environment: @@ -111,7 +112,7 @@ services: - 
RETRIEVER_SERVICE_HOST_IP=retriever - RERANK_SERVER_HOST_IP=tei-reranking-service - RERANK_SERVER_PORT=${RERANK_SERVER_PORT:-80} - - LLM_SERVER_HOST_IP=vllm-service + - LLM_SERVER_HOST_IP=tgi-service - LLM_SERVER_PORT=${LLM_SERVER_PORT:-80} - LLM_MODEL=${LLM_MODEL_ID} - LOGFLAG=${LOGFLAG} diff --git a/ChatQnA/docker_compose/intel/cpu/xeon/compose_without_rerank.yaml b/ChatQnA/docker_compose/intel/cpu/xeon/compose_without_rerank.yaml index 92d7fcf7bc..dd675dd0dd 100644 --- a/ChatQnA/docker_compose/intel/cpu/xeon/compose_without_rerank.yaml +++ b/ChatQnA/docker_compose/intel/cpu/xeon/compose_without_rerank.yaml @@ -58,22 +58,22 @@ services: LOGFLAG: ${LOGFLAG} RETRIEVER_COMPONENT_NAME: "OPEA_RETRIEVER_REDIS" restart: unless-stopped - tgi-service: - image: ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu - container_name: tgi-service + vllm-service: + image: ${REGISTRY:-opea}/vllm:${TAG:-latest} + container_name: vllm-service ports: - "9009:80" volumes: - "./data:/data" - shm_size: 1g + shm_size: 128g environment: no_proxy: ${no_proxy} http_proxy: ${http_proxy} https_proxy: ${https_proxy} HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} - HF_HUB_DISABLE_PROGRESS_BARS: 1 - HF_HUB_ENABLE_HF_TRANSFER: 0 - command: --model-id ${LLM_MODEL_ID} --cuda-graphs 0 + LLM_MODEL_ID: ${LLM_MODEL_ID} + VLLM_TORCH_PROFILER_DIR: "/mnt" + command: --model $LLM_MODEL_ID --host 0.0.0.0 --port 80 chatqna-xeon-backend-server: image: ${REGISTRY:-opea}/chatqna-without-rerank:${TAG:-latest} container_name: chatqna-xeon-backend-server @@ -82,7 +82,7 @@ services: - tei-embedding-service - dataprep-redis-service - retriever - - tgi-service + - vllm-service ports: - "8888:8888" environment: @@ -93,7 +93,7 @@ services: - EMBEDDING_SERVER_HOST_IP=tei-embedding-service - EMBEDDING_SERVER_PORT=${EMBEDDING_SERVER_PORT:-80} - RETRIEVER_SERVICE_HOST_IP=retriever - - LLM_SERVER_HOST_IP=tgi-service + - LLM_SERVER_HOST_IP=vllm-service - LLM_SERVER_PORT=${LLM_SERVER_PORT:-80} - LLM_MODEL=${LLM_MODEL_ID} - LOGFLAG=${LOGFLAG} diff --git a/ChatQnA/tests/test_compose_on_xeon.sh b/ChatQnA/tests/test_compose_on_xeon.sh index babca0cd43..1808dbd459 100644 --- a/ChatQnA/tests/test_compose_on_xeon.sh +++ b/ChatQnA/tests/test_compose_on_xeon.sh @@ -17,12 +17,12 @@ ip_address=$(hostname -I | awk '{print $1}') function build_docker_images() { cd $WORKPATH/docker_image_build git clone https://github.com/opea-project/GenAIComps.git && cd GenAIComps && git checkout "${opea_branch:-"main"}" && cd ../ + git clone https://github.com/vllm-project/vllm.git echo "Build all the images with --no-cache, check docker_image_build.log for details..." 
- service_list="chatqna chatqna-ui dataprep-redis retriever nginx" + service_list="chatqna chatqna-ui dataprep-redis retriever vllm nginx" docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log - docker pull ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 docker images && sleep 1s @@ -33,21 +33,19 @@ function start_services() { export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5" export RERANK_MODEL_ID="BAAI/bge-reranker-base" - export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct" + export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3" export INDEX_NAME="rag-redis" export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN} # Start Docker Containers - sed -i "s|container_name: chatqna-xeon-backend-server|container_name: chatqna-xeon-backend-server\n volumes:\n - \"${WORKPATH}\/docker_image_build\/GenAIComps:\/home\/user\/GenAIComps\"|g" compose.yaml docker compose -f compose.yaml up -d > ${LOG_PATH}/start_services_with_compose.log - n=0 - until [[ "$n" -ge 500 ]]; do - docker logs tgi-service > ${LOG_PATH}/tgi_service_start.log - if grep -q Connected ${LOG_PATH}/tgi_service_start.log; then + until [[ "$n" -ge 100 ]]; do + docker logs vllm-service > ${LOG_PATH}/vllm_service_start.log 2>&1 + if grep -q complete ${LOG_PATH}/vllm_service_start.log; then break fi - sleep 1s + sleep 5s n=$((n+1)) done } @@ -59,38 +57,24 @@ function validate_service() { local DOCKER_NAME="$4" local INPUT_DATA="$5" - if [[ $SERVICE_NAME == *"dataprep_upload_file"* ]]; then - cd $LOG_PATH - HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -F 'files=@./dataprep_file.txt' -H 'Content-Type: multipart/form-data' "$URL") - elif [[ $SERVICE_NAME == *"dataprep_upload_link"* ]]; then - HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -F 'link_list=["https://www.ces.tech/"]' "$URL") - elif [[ $SERVICE_NAME == *"dataprep_get"* ]]; then - HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -H 'Content-Type: application/json' "$URL") - elif [[ $SERVICE_NAME == *"dataprep_del"* ]]; then - HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -d '{"file_path": "all"}' -H 'Content-Type: application/json' "$URL") - else - HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL") - fi - HTTP_STATUS=$(echo $HTTP_RESPONSE | tr -d '\n' | sed -e 's/.*HTTPSTATUS://') - RESPONSE_BODY=$(echo $HTTP_RESPONSE | sed -e 's/HTTPSTATUS\:.*//g') + local HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL") + if [ "$HTTP_STATUS" -eq 200 ]; then + echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..." - docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log + local CONTENT=$(curl -s -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL" | tee ${LOG_PATH}/${SERVICE_NAME}.log) - # check response status - if [ "$HTTP_STATUS" -ne "200" ]; then - echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS" - exit 1 + if echo "$CONTENT" | grep -q "$EXPECTED_RESULT"; then + echo "[ $SERVICE_NAME ] Content is as expected." + else + echo "[ $SERVICE_NAME ] Content does not match the expected result: $CONTENT" + docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log + exit 1 + fi else - echo "[ $SERVICE_NAME ] HTTP status is 200. 
Checking content..." - fi - # check response body - if [[ "$RESPONSE_BODY" != *"$EXPECTED_RESULT"* ]]; then - echo "[ $SERVICE_NAME ] Content does not match the expected result: $RESPONSE_BODY" + echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS" + docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log exit 1 - else - echo "[ $SERVICE_NAME ] Content is as expected." fi - sleep 1s } @@ -100,48 +84,19 @@ function validate_microservices() { # tei for embedding service validate_service \ "${ip_address}:6006/embed" \ - "[[" \ + "\[\[" \ "tei-embedding" \ "tei-embedding-server" \ '{"inputs":"What is Deep Learning?"}' sleep 1m # retrieval can't curl as expected, try to wait for more time - # test /v1/dataprep upload file - echo "Deep learning is a subset of machine learning that utilizes neural networks with multiple layers to analyze various levels of abstract data representations. It enables computers to identify patterns and make decisions with minimal human intervention by learning from large amounts of data." > $LOG_PATH/dataprep_file.txt - validate_service \ - "http://${ip_address}:6007/v1/dataprep" \ - "Data preparation succeeded" \ - "dataprep_upload_file" \ - "dataprep-redis-server" - - # test /v1/dataprep upload link - validate_service \ - "http://${ip_address}:6007/v1/dataprep" \ - "Data preparation succeeded" \ - "dataprep_upload_link" \ - "dataprep-redis-server" - - # test /v1/dataprep/get_file - validate_service \ - "http://${ip_address}:6007/v1/dataprep/get_file" \ - '{"name":' \ - "dataprep_get" \ - "dataprep-redis-server" - - # test /v1/dataprep/delete_file - validate_service \ - "http://${ip_address}:6007/v1/dataprep/delete_file" \ - '{"status":true}' \ - "dataprep_del" \ - "dataprep-redis-server" - # retrieval microservice test_embedding=$(python3 -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)") validate_service \ "${ip_address}:7000/v1/retrieval" \ - "retrieved_docs" \ - "retrieval-microservice" \ + " " \ + "retrieval" \ "retriever-redis-server" \ "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${test_embedding}}" @@ -153,29 +108,27 @@ function validate_microservices() { "tei-reranking-server" \ '{"query":"What is Deep Learning?", "texts": ["Deep Learning is not...", "Deep learning is..."]}' - # tgi for llm service + # vllm for llm service validate_service \ - "${ip_address}:9009/generate" \ - "generated_text" \ - "tgi-llm" \ - "tgi-service" \ - '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' - + "${ip_address}:9009/v1/chat/completions" \ + "content" \ + "vllm-llm" \ + "vllm-service" \ + '{"model": "Intel/neural-chat-7b-v3-3", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens": 17}' } function validate_megaservice() { # Curl the Mega Service validate_service \ "${ip_address}:8888/v1/chatqna" \ - "data: " \ - "chatqna-megaservice" \ + "data" \ + "mega-chatqna" \ "chatqna-xeon-backend-server" \ '{"messages": "What is the revenue of Nike in 2023?"}' } function validate_frontend() { - echo "[ TEST INFO ]: --------- frontend test started ---------" cd $WORKPATH/ui/svelte local conda_env_name="OPEA_e2e" export PATH=${HOME}/miniforge3/bin/:$PATH @@ -184,8 +137,8 @@ function validate_frontend() { else conda create -n ${conda_env_name} python=3.12 -y fi + source activate ${conda_env_name} - echo "[ TEST INFO ]: --------- conda env activated ---------" sed -i "s/localhost/$ip_address/g" playwright.config.ts @@ 
-206,7 +159,7 @@ function validate_frontend() { function stop_docker() { cd $WORKPATH/docker_compose/intel/cpu/xeon - docker compose stop && docker compose rm -f + docker compose -f compose.yaml down } function main() { @@ -223,11 +176,8 @@ function main() { python3 $WORKPATH/tests/chatqna_benchmark.py elif [ "${mode}" == "" ]; then validate_microservices - echo "==== microservices validated ====" validate_megaservice - echo "==== megaservice validated ====" - validate_frontend - echo "==== frontend validated ====" + # validate_frontend fi stop_docker diff --git a/ChatQnA/tests/test_compose_pinecone_on_xeon.sh b/ChatQnA/tests/test_compose_pinecone_on_xeon.sh index 35c58f6754..d525cdb80a 100755 --- a/ChatQnA/tests/test_compose_pinecone_on_xeon.sh +++ b/ChatQnA/tests/test_compose_pinecone_on_xeon.sh @@ -17,12 +17,12 @@ ip_address=$(hostname -I | awk '{print $1}') function build_docker_images() { cd $WORKPATH/docker_image_build git clone https://github.com/opea-project/GenAIComps.git && cd GenAIComps && git checkout "${opea_branch:-"main"}" && cd ../ + git clone https://github.com/vllm-project/vllm.git echo "Build all the images with --no-cache, check docker_image_build.log for details..." - service_list="chatqna chatqna-ui dataprep-pinecone retriever nginx" + service_list="chatqna chatqna-ui dataprep-pinecone retriever vllm nginx" docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log - docker pull ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 docker images && sleep 1s @@ -33,7 +33,7 @@ function start_services() { export no_proxy=${no_proxy},${ip_address} export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5" export RERANK_MODEL_ID="BAAI/bge-reranker-base" - export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct" + export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3" export PINECONE_API_KEY=${PINECONE_KEY_LANGCHAIN_TEST} export PINECONE_INDEX_NAME="langchain-test" export INDEX_NAME="langchain-test" @@ -44,12 +44,12 @@ function start_services() { docker compose -f compose_pinecone.yaml up -d > ${LOG_PATH}/start_services_with_compose.log n=0 - until [[ "$n" -ge 500 ]]; do - docker logs tgi-service > ${LOG_PATH}/tgi_service_start.log - if grep -q Connected ${LOG_PATH}/tgi_service_start.log; then + until [[ "$n" -ge 100 ]]; do + docker logs vllm-service > ${LOG_PATH}/vllm_service_start.log 2>&1 + if grep -q complete ${LOG_PATH}/vllm_service_start.log; then break fi - sleep 1s + sleep 5s n=$((n+1)) done } @@ -146,15 +146,14 @@ function validate_microservices() { '{"query":"What is Deep Learning?", "texts": ["Deep Learning is not...", "Deep learning is..."]}' - # tgi for llm service + # vllm for llm service echo "Validating llm service" validate_service \ - "${ip_address}:9009/generate" \ - "generated_text" \ - "tgi-llm" \ - "tgi-service" \ - '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' - + "${ip_address}:9009/v1/chat/completions" \ + "content" \ + "vllm-llm" \ + "vllm-service" \ + '{"model": "Intel/neural-chat-7b-v3-3", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens": 17}' } function validate_megaservice() { diff --git a/ChatQnA/tests/test_compose_qdrant_on_xeon.sh b/ChatQnA/tests/test_compose_qdrant_on_xeon.sh index ee4b4efb0a..299a7def1a 100644 --- a/ChatQnA/tests/test_compose_qdrant_on_xeon.sh +++ b/ChatQnA/tests/test_compose_qdrant_on_xeon.sh @@ -17,9 +17,10 @@ ip_address=$(hostname -I | awk 
'{print $1}') function build_docker_images() { cd $WORKPATH/docker_image_build git clone https://github.com/opea-project/GenAIComps.git && cd GenAIComps && git checkout "${opea_branch:-"main"}" && cd ../ + git clone https://github.com/vllm-project/vllm.git echo "Build all the images with --no-cache, check docker_image_build.log for details..." - service_list="chatqna chatqna-ui dataprep-qdrant retriever nginx" + service_list="chatqna chatqna-ui dataprep-qdrant retriever vllm nginx" docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log docker images && sleep 1s @@ -40,8 +41,8 @@ function start_services() { docker compose -f compose_qdrant.yaml up -d > ${LOG_PATH}/start_services_with_compose.log n=0 until [[ "$n" -ge 100 ]]; do - docker logs tgi-service > tgi_service_start.log - if grep -q Connected tgi_service_start.log; then + docker logs vllm-service > ${LOG_PATH}/vllm_service_start.log 2>&1 + if grep -q complete ${LOG_PATH}/vllm_service_start.log; then break fi sleep 5s @@ -49,7 +50,7 @@ function start_services() { done } -function validate_services() { +function validate_service() { local URL="$1" local EXPECTED_RESULT="$2" local SERVICE_NAME="$3" @@ -91,7 +92,7 @@ function validate_microservices() { # Check if the microservices are running correctly. # tei for embedding service - validate_services \ + validate_service \ "${ip_address}:6040/embed" \ "[[" \ "tei-embedding" \ @@ -100,14 +101,14 @@ function validate_microservices() { # test /v1/dataprep upload file echo "Deep learning is a subset of machine learning that utilizes neural networks with multiple layers to analyze various levels of abstract data representations. It enables computers to identify patterns and make decisions with minimal human intervention by learning from large amounts of data." 
> $LOG_PATH/dataprep_file.txt - validate_services \ + validate_service \ "${ip_address}:6043/v1/dataprep" \ "Data preparation succeeded" \ "dataprep_upload_file" \ "dataprep-qdrant-server" # test upload link - validate_services \ + validate_service \ "${ip_address}:6043/v1/dataprep" \ "Data preparation succeeded" \ "dataprep_upload_link" \ @@ -115,7 +116,7 @@ function validate_microservices() { # retrieval microservice test_embedding=$(python3 -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)") - validate_services \ + validate_service \ "${ip_address}:6045/v1/retrieval" \ "retrieved_docs" \ "retrieval" \ @@ -123,25 +124,25 @@ function validate_microservices() { "{\"text\":\"What is Deep Learning?\",\"embedding\":${test_embedding}}" # tei for rerank microservice - validate_services \ + validate_service \ "${ip_address}:6041/rerank" \ '{"index":1,"score":' \ "tei-rerank" \ "tei-reranking-server" \ '{"query":"What is Deep Learning?", "texts": ["Deep Learning is not...", "Deep learning is..."]}' - # tgi for llm service - validate_services \ - "${ip_address}:6042/generate" \ - "generated_text" \ - "tgi-llm" \ - "tgi-service" \ - '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' + # vllm for llm service + validate_service \ + "${ip_address}:6042/v1/chat/completions" \ + "content" \ + "vllm-llm" \ + "vllm-service" \ + '{"model": "Intel/neural-chat-7b-v3-3", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens": 17}' } function validate_megaservice() { # Curl the Mega Service - validate_services \ + validate_service \ "${ip_address}:8912/v1/chatqna" \ "data: " \ "mega-chatqna" \ @@ -174,7 +175,7 @@ function validate_frontend() { function stop_docker() { cd $WORKPATH/docker_compose/intel/cpu/xeon - docker compose -f compose_qdrant.yaml stop && docker compose -f compose_qdrant.yaml rm -f + docker compose -f compose_qdrant.yaml down } function main() { diff --git a/ChatQnA/tests/test_compose_vllm_on_xeon.sh b/ChatQnA/tests/test_compose_tgi_on_xeon.sh similarity index 50% rename from ChatQnA/tests/test_compose_vllm_on_xeon.sh rename to ChatQnA/tests/test_compose_tgi_on_xeon.sh index 6d95c68a91..ca2a7c3fda 100644 --- a/ChatQnA/tests/test_compose_vllm_on_xeon.sh +++ b/ChatQnA/tests/test_compose_tgi_on_xeon.sh @@ -17,13 +17,12 @@ ip_address=$(hostname -I | awk '{print $1}') function build_docker_images() { cd $WORKPATH/docker_image_build git clone https://github.com/opea-project/GenAIComps.git && cd GenAIComps && git checkout "${opea_branch:-"main"}" && cd ../ - git clone https://github.com/vllm-project/vllm.git echo "Build all the images with --no-cache, check docker_image_build.log for details..." 
- service_list="chatqna chatqna-ui dataprep-redis retriever vllm nginx" + service_list="chatqna chatqna-ui dataprep-redis retriever nginx" docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log - docker pull ghcr.io/huggingface/tgi-gaudi:2.0.6 + docker pull ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 docker images && sleep 1s @@ -39,11 +38,13 @@ function start_services() { export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN} # Start Docker Containers - docker compose -f compose_vllm.yaml up -d > ${LOG_PATH}/start_services_with_compose.log + sed -i "s|container_name: chatqna-xeon-backend-server|container_name: chatqna-xeon-backend-server\n volumes:\n - \"${WORKPATH}\/docker_image_build\/GenAIComps:\/home\/user\/GenAIComps\"|g" compose.yaml + docker compose -f compose_tgi.yaml up -d > ${LOG_PATH}/start_services_with_compose.log + n=0 until [[ "$n" -ge 100 ]]; do - docker logs vllm-service > ${LOG_PATH}/vllm_service_start.log - if grep -q Connected ${LOG_PATH}/vllm_service_start.log; then + docker logs tgi-service > ${LOG_PATH}/tgi_service_start.log + if grep -q Connected ${LOG_PATH}/tgi_service_start.log; then break fi sleep 5s @@ -51,31 +52,45 @@ function start_services() { done } -function validate_services() { +function validate_service() { local URL="$1" local EXPECTED_RESULT="$2" local SERVICE_NAME="$3" local DOCKER_NAME="$4" local INPUT_DATA="$5" - local HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL") - if [ "$HTTP_STATUS" -eq 200 ]; then - echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..." + if [[ $SERVICE_NAME == *"dataprep_upload_file"* ]]; then + cd $LOG_PATH + HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -F 'files=@./dataprep_file.txt' -H 'Content-Type: multipart/form-data' "$URL") + elif [[ $SERVICE_NAME == *"dataprep_upload_link"* ]]; then + HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -F 'link_list=["https://www.ces.tech/"]' "$URL") + elif [[ $SERVICE_NAME == *"dataprep_get"* ]]; then + HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -H 'Content-Type: application/json' "$URL") + elif [[ $SERVICE_NAME == *"dataprep_del"* ]]; then + HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -d '{"file_path": "all"}' -H 'Content-Type: application/json' "$URL") + else + HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL") + fi + HTTP_STATUS=$(echo $HTTP_RESPONSE | tr -d '\n' | sed -e 's/.*HTTPSTATUS://') + RESPONSE_BODY=$(echo $HTTP_RESPONSE | sed -e 's/HTTPSTATUS\:.*//g') - local CONTENT=$(curl -s -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL" | tee ${LOG_PATH}/${SERVICE_NAME}.log) + docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log - if echo "$CONTENT" | grep -q "$EXPECTED_RESULT"; then - echo "[ $SERVICE_NAME ] Content is as expected." - else - echo "[ $SERVICE_NAME ] Content does not match the expected result: $CONTENT" - docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log - exit 1 - fi - else + # check response status + if [ "$HTTP_STATUS" -ne "200" ]; then echo "[ $SERVICE_NAME ] HTTP status is not 200. 
Received status was $HTTP_STATUS" - docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log exit 1 + else + echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..." + fi + # check response body + if [[ "$RESPONSE_BODY" != *"$EXPECTED_RESULT"* ]]; then + echo "[ $SERVICE_NAME ] Content does not match the expected result: $RESPONSE_BODY" + exit 1 + else + echo "[ $SERVICE_NAME ] Content is as expected." fi + sleep 1s } @@ -83,53 +98,83 @@ function validate_microservices() { # Check if the microservices are running correctly. # tei for embedding service - validate_services \ + validate_service \ "${ip_address}:6006/embed" \ - "\[\[" \ + "[[" \ "tei-embedding" \ "tei-embedding-server" \ '{"inputs":"What is Deep Learning?"}' sleep 1m # retrieval can't curl as expected, try to wait for more time + # test /v1/dataprep upload file + echo "Deep learning is a subset of machine learning that utilizes neural networks with multiple layers to analyze various levels of abstract data representations. It enables computers to identify patterns and make decisions with minimal human intervention by learning from large amounts of data." > $LOG_PATH/dataprep_file.txt + validate_service \ + "http://${ip_address}:6007/v1/dataprep" \ + "Data preparation succeeded" \ + "dataprep_upload_file" \ + "dataprep-redis-server" + + # test /v1/dataprep upload link + validate_service \ + "http://${ip_address}:6007/v1/dataprep" \ + "Data preparation succeeded" \ + "dataprep_upload_link" \ + "dataprep-redis-server" + + # test /v1/dataprep/get_file + validate_service \ + "http://${ip_address}:6007/v1/dataprep/get_file" \ + '{"name":' \ + "dataprep_get" \ + "dataprep-redis-server" + + # test /v1/dataprep/delete_file + validate_service \ + "http://${ip_address}:6007/v1/dataprep/delete_file" \ + '{"status":true}' \ + "dataprep_del" \ + "dataprep-redis-server" + # retrieval microservice test_embedding=$(python3 -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)") - validate_services \ + validate_service \ "${ip_address}:7000/v1/retrieval" \ - " " \ - "retrieval" \ + "retrieved_docs" \ + "retrieval-microservice" \ "retriever-redis-server" \ "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${test_embedding}}" # tei for rerank microservice - validate_services \ + validate_service \ "${ip_address}:8808/rerank" \ '{"index":1,"score":' \ "tei-rerank" \ "tei-reranking-server" \ '{"query":"What is Deep Learning?", "texts": ["Deep Learning is not...", "Deep learning is..."]}' - # vllm for llm service - validate_services \ - "${ip_address}:9009/v1/completions" \ - "text" \ - "vllm-llm" \ - "vllm-service" \ - '{"model": "Intel/neural-chat-7b-v3-3", "prompt": "What is Deep Learning?", "max_tokens": 32, "temperature": 0}' + # tgi for llm service + validate_service \ + "${ip_address}:9009/v1/chat/completions" \ + "content" \ + "tgi-llm" \ + "tgi-service" \ + '{"model": "Intel/neural-chat-7b-v3-3", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens": 17}' } function validate_megaservice() { # Curl the Mega Service - validate_services \ + validate_service \ "${ip_address}:8888/v1/chatqna" \ - "data" \ - "mega-chatqna" \ + "data: " \ + "chatqna-megaservice" \ "chatqna-xeon-backend-server" \ '{"messages": "What is the revenue of Nike in 2023?"}' } function validate_frontend() { + echo "[ TEST INFO ]: --------- frontend test started ---------" cd $WORKPATH/ui/svelte local conda_env_name="OPEA_e2e" export PATH=${HOME}/miniforge3/bin/:$PATH @@ -138,8 
+183,8 @@ function validate_frontend() { else conda create -n ${conda_env_name} python=3.12 -y fi - source activate ${conda_env_name} + echo "[ TEST INFO ]: --------- conda env activated ---------" sed -i "s/localhost/$ip_address/g" playwright.config.ts @@ -160,7 +205,7 @@ function validate_frontend() { function stop_docker() { cd $WORKPATH/docker_compose/intel/cpu/xeon - docker compose -f compose_vllm.yaml down + docker compose -f compose_tgi.yaml down } function main() { @@ -177,8 +222,11 @@ function main() { python3 $WORKPATH/tests/chatqna_benchmark.py elif [ "${mode}" == "" ]; then validate_microservices + echo "==== microservices validated ====" validate_megaservice - # validate_frontend + echo "==== megaservice validated ====" + validate_frontend + echo "==== frontend validated ====" fi stop_docker diff --git a/ChatQnA/tests/test_compose_without_rerank_on_xeon.sh b/ChatQnA/tests/test_compose_without_rerank_on_xeon.sh index 230b8a5d60..7d2858e41a 100644 --- a/ChatQnA/tests/test_compose_without_rerank_on_xeon.sh +++ b/ChatQnA/tests/test_compose_without_rerank_on_xeon.sh @@ -17,12 +17,12 @@ ip_address=$(hostname -I | awk '{print $1}') function build_docker_images() { cd $WORKPATH/docker_image_build git clone https://github.com/opea-project/GenAIComps.git && cd GenAIComps && git checkout "${opea_branch:-"main"}" && cd ../ + git clone https://github.com/vllm-project/vllm.git echo "Build all the images with --no-cache, check docker_image_build.log for details..." - service_list="chatqna-without-rerank chatqna-ui dataprep-redis retriever nginx" + service_list="chatqna-without-rerank chatqna-ui dataprep-redis retriever vllm nginx" docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log - docker pull ghcr.io/huggingface/tgi-gaudi:2.0.6 docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 docker images && sleep 1s @@ -40,8 +40,8 @@ function start_services() { docker compose -f compose_without_rerank.yaml up -d > ${LOG_PATH}/start_services_with_compose.log n=0 until [[ "$n" -ge 100 ]]; do - docker logs tgi-service > ${LOG_PATH}/tgi_service_start.log - if grep -q Connected ${LOG_PATH}/tgi_service_start.log; then + docker logs vllm-service > ${LOG_PATH}/vllm_service_start.log 2>&1 + if grep -q complete ${LOG_PATH}/vllm_service_start.log; then break fi sleep 5s @@ -142,13 +142,13 @@ function validate_microservices() { "retriever-redis-server" \ "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${test_embedding}}" - # tgi for llm service + # vllm for llm service validate_service \ - "${ip_address}:9009/generate" \ - "generated_text" \ - "tgi-llm" \ - "tgi-service" \ - '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' + "${ip_address}:9009/v1/chat/completions" \ + "content" \ + "vllm-llm" \ + "vllm-service" \ + '{"model": "Intel/neural-chat-7b-v3-3", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens": 17}' } function validate_megaservice() { @@ -194,7 +194,7 @@ function validate_frontend() { function stop_docker() { cd $WORKPATH/docker_compose/intel/cpu/xeon/ - docker compose -f compose_without_rerank.yaml stop && docker compose -f compose_without_rerank.yaml rm -f + docker compose -f compose_without_rerank.yaml down } function main() {