fix merge main conflict

Signed-off-by: letonghan <letong.han@intel.com>
letonghan committed Sep 10, 2024
2 parents 35afbe6 + 94eb60f commit 90a3f4b
Showing 36 changed files with 920 additions and 67 deletions.
8 changes: 8 additions & 0 deletions .github/workflows/docker/compose/reranks-compose-cd.yaml
@@ -14,3 +14,11 @@ services:
    build:
      dockerfile: comps/reranks/mosec/langchain/Dockerfile
    image: ${REGISTRY:-opea}/reranking-langchain-mosec:${TAG:-latest}
  reranking-mosec-neural-speed:
    build:
      dockerfile: comps/reranks/neural-speed/docker/Dockerfile
    image: ${REGISTRY:-opea}/reranking-mosec-neural-speed:${TAG:-latest}
  reranking-mosec-neural-speed-endpoint:
    build:
      dockerfile: comps/reranks/neural-speed/neuralspeed-docker/Dockerfile
    image: ${REGISTRY:-opea}/reranking-mosec-neural-speed-endpoint:${TAG:-latest}
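As a quick local sanity check, entries like these can be validated with a few lines of Python — a sketch, assuming PyYAML is installed and a repository checkout with the layout above:

```python
# Validate a compose CD file: every service should declare a build.dockerfile
# and an image tag, mirroring the entries added above.
import yaml  # PyYAML, assumed installed

with open(".github/workflows/docker/compose/reranks-compose-cd.yaml") as f:
    compose = yaml.safe_load(f)

for name, svc in compose["services"].items():
    assert "dockerfile" in svc.get("build", {}), f"{name}: missing build.dockerfile"
    assert "image" in svc, f"{name}: missing image"
    print(f"{name}: OK")
```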
65 changes: 33 additions & 32 deletions .github/workflows/pr-dockerfile-path-scan.yaml
@@ -51,6 +51,7 @@ jobs:
          fi
      - name: Check for changed Dockerfile paths in readme
        if: always()
        run: |
          set -e
          shopt -s globstar
@@ -75,6 +76,37 @@
          exit 1
          fi
      - name: Check new Dockerfile in compose yaml
        if: always()
        run: |
          set -xe
          shopt -s globstar
          cd ${{github.workspace}}
          no_add="FALSE"
          merged_commit=$(git log -1 --format='%H')
          changed_files="$(git diff --name-status --diff-filter=A ${{ github.event.pull_request.base.sha }} ${merged_commit} -- '**/Dockerfile**' | cut -f2)"
          changed_yamls="$(git diff --name-status --diff-filter=AM ${{ github.event.pull_request.base.sha }} ${merged_commit} -- '**/*.yaml**' | cut -f2)"
          if [ -n "$changed_files" ]; then
            for file in $changed_files; do
              service=$(echo "$file" | awk -F '/' '{print $2}')
              if find "${{github.workspace}}/.github/workflows/docker/compose/" -name "*$service*" | grep -q .; then
                if [ -n "$changed_files" ] && grep -q $service'-compose-cd.yaml' <<< "$changed_yamls"; then
                  echo "The $file has been added to the ${{github.workspace}}/.github/workflows/docker/compose/"$service"-compose-cd.yaml."
                else
                  echo "Please check if the added $file is included in the yaml under path ${{github.workspace}}/.github/workflows/docker/compose/"$service"-compose-cd.yaml."
                  no_add="TRUE"
                fi
              else
                echo "Please create a new compose file named "$service"-compose-cd.yaml in ${{github.workspace}}/.github/workflows/docker/compose/ for $file and fill it in."
                no_add="TRUE"
              fi
            done
          fi
          if [[ "$no_add" == "TRUE" ]]; then
            exit 1
          fi
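For reference, the convention this step enforces can be expressed as a standalone Python sketch (the paths and inputs below are illustrative, not part of the workflow):

```python
# For each newly added Dockerfile, take the second path component as the
# service name and require a modified <service>-compose-cd.yaml under
# .github/workflows/docker/compose/ — the same rule as the bash step above.
from pathlib import Path

COMPOSE_DIR = Path(".github/workflows/docker/compose")

def check_new_dockerfiles(added_dockerfiles, changed_yamls):
    problems = []
    for file in added_dockerfiles:
        service = Path(file).parts[1]  # comps/reranks/... -> "reranks"
        cd_yaml = f"{service}-compose-cd.yaml"
        if not any(COMPOSE_DIR.glob(f"*{service}*")):
            problems.append(f"create {cd_yaml} for {file}")
        elif not any(y.endswith(cd_yaml) for y in changed_yamls):
            problems.append(f"add {file} to {cd_yaml}")
    return problems

# With the compose change above, the added Dockerfile passes the check when
# run from a repository checkout (an empty list means no problems found):
print(check_new_dockerfiles(
    ["comps/reranks/neural-speed/docker/Dockerfile"],
    [".github/workflows/docker/compose/reranks-compose-cd.yaml"],
))
```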
  Dockerfile-path-change-detection-in-GenAIExamples:
    runs-on: ubuntu-latest
    steps:
@@ -100,7 +132,7 @@ jobs:
          is_use="FALSE"
          used_files=""
          merged_commit=$(git log -1 --format='%H')
-          changed_files="$(git diff --name-status --diff-filter=DR ${{ github.event.pull_request.base.sha }} ${merged_commit} -- '**/Dockerfile' | cut -f2)"
+          changed_files="$(git diff --name-status --diff-filter=DR ${{ github.event.pull_request.base.sha }} ${merged_commit} -- '**/Dockerfile**' | cut -f2)"
          if [ -n "$changed_files" ]; then
            for file in $changed_files; do
              matching_files=$(grep -rl "$file" ../GenAIExamples/**/*.md)
@@ -119,34 +151,3 @@
          echo "Please modify the corresponding README in GenAIExamples repo and ask suyue.chen@intel.com for final confirmation."
          exit 1
          fi
-  Dockerfile-addition-detection-in-GenAIComps:
-    runs-on: ubuntu-latest
-    steps:
-      - name: Clean Up Working Directory
-        run: sudo rm -rf ${{github.workspace}}/*
-
-      - name: Checkout code
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-
-      - name: Check if the Dockerfile has been added
-        run: |
-          set -e
-          shopt -s globstar
-          cd ${{github.workspace}}
-          is_use="FALSE"
-          used_files=""
-          merged_commit=$(git log -1 --format='%H')
-          changed_files="$(git diff --name-status --diff-filter=A ${{ github.event.pull_request.base.sha }} ${merged_commit} -- '**/Dockerfile**' | cut -f2)"
-          if [ -n "$changed_files" ]; then
-            for file in $changed_files; do
-              if find "${{github.workspace}}/.github/workflows/docker/compose/" -name "*$(echo "$file" | awk -F '/' '{print $2}')*" | grep -q .; then
-                echo "Please check if the added $file is included in the yaml under path ${{github.workspace}}/.github/workflows/docker/compose/."
-              else
-                echo "Please create a new compose file named service_name-compose-cd.yaml in ${{github.workspace}}/.github/workflows/docker/compose/ for $file and fill it in."
-              fi
-            done
-            exit 1
-          fi
1 change: 1 addition & 0 deletions .github/workflows/pr-microservice-test.yml
@@ -47,6 +47,7 @@ jobs:
          GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }}
          GOOGLE_CSE_ID: ${{ secrets.GOOGLE_CSE_ID }}
          PINECONE_KEY: ${{ secrets.PINECONE_KEY }}
          PREDICTIONGUARD_API_KEY: ${{ secrets.PREDICTIONGUARD_API_KEY }}
          service: ${{ matrix.service }}
          hardware: ${{ matrix.hardware }}
        run: |
4 changes: 2 additions & 2 deletions README.md
@@ -55,7 +55,7 @@ The initially supported `Microservices` are described in the below table. More `
<tr>
<td rowspan="2"><a href="./comps/embeddings/README.md">Embedding</a></td>
<td rowspan="2"><a href="https://www.langchain.com">LangChain</a>/<a href="https://www.llamaindex.ai">LlamaIndex</a></td>
-<td rowspan="2"><a href="https://huggingface.co/BAAI/bge-large-en-v1.5">BAAI/bge-large-en-v1.5</a></td>
+<td rowspan="2"><a href="https://huggingface.co/BAAI/bge-base-en-v1.5">BAAI/bge-base-en-v1.5</a></td>
<td><a href="https://github.com/huggingface/tei-gaudi">TEI-Gaudi</a></td>
<td>Gaudi2</td>
<td>Embedding on Gaudi2</td>
@@ -76,7 +76,7 @@ The initially supported `Microservices` are described in the below table. More `
<tr>
<td rowspan="2"><a href="./comps/reranks/README.md">Reranking</a></td>
<td rowspan="2"><a href="https://www.langchain.com">LangChain</a>/<a href="https://www.llamaindex.ai">LlamaIndex</a></td>
-<td ><a href="https://huggingface.co/BAAI/bge-reranker-large">BAAI/bge-reranker-large</a></td>
+<td ><a href="https://huggingface.co/BAAI/bge-reranker-base">BAAI/bge-reranker-base</a></td>
<td><a href="https://github.com/huggingface/tei-gaudi">TEI-Gaudi</a></td>
<td>Gaudi2</td>
<td>Reranking on Gaudi2</td>
4 changes: 2 additions & 2 deletions comps/dataprep/redis/README.md
@@ -49,8 +49,8 @@ First, you need to start a TEI service.

```bash
your_port=6006
model="BAAI/bge-large-en-v1.5"
docker run -p $your_port:80 -v ./data:/data --name tei_server -e http_proxy=$http_proxy -e https_proxy=$https_proxy --pull always ghcr.io/huggingface/text-embeddings-inference:cpu-1.2 --model-id $model
model="BAAI/bge-base-en-v1.5"
docker run -p $your_port:80 -v ./data:/data --name tei_server -e http_proxy=$http_proxy -e https_proxy=$https_proxy --pull always ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 --model-id $model
```
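Once the container is up, the service can also be sanity-checked from Python — a minimal sketch, assuming TEI's standard `/embed` route and the port mapping chosen above:

```python
# Probe the TEI container started above with a single input.
import requests

resp = requests.post(
    "http://localhost:6006/embed",
    json={"inputs": "What is Deep Learning?"},
    timeout=30,
)
resp.raise_for_status()
print(len(resp.json()[0]))  # embedding dimension; 768 for bge-base-en-v1.5
```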

Then you need to test your TEI service using the following commands:
2 changes: 1 addition & 1 deletion comps/dataprep/redis/langchain/config.py
@@ -5,7 +5,7 @@

# Embedding model

-EMBED_MODEL = os.getenv("EMBED_MODEL", "BAAI/bge-large-en-v1.5")
+EMBED_MODEL = os.getenv("EMBED_MODEL", "BAAI/bge-base-en-v1.5")

# Redis Connection Information
REDIS_HOST = os.getenv("REDIS_HOST", "localhost")
14 changes: 13 additions & 1 deletion comps/dataprep/utils.py
@@ -285,6 +285,16 @@ def load_json(json_path):
    return content_list


def load_jsonl(jsonl_path):
    """Load and process jsonl file."""
    content_list = []
    with open(jsonl_path, "r") as file:
        for line in file:
            json_obj = json.loads(line)
            content_list.append(json_obj)
    return content_list


def load_yaml(yaml_path):
    """Load and process yaml file."""
    with open(yaml_path, "r") as file:
@@ -351,8 +361,10 @@ def document_loader(doc_path):
        return load_md(doc_path)
    elif doc_path.endswith(".xml"):
        return load_xml(doc_path)
-    elif doc_path.endswith(".json") or doc_path.endswith(".jsonl"):
+    elif doc_path.endswith(".json"):
        return load_json(doc_path)
+    elif doc_path.endswith(".jsonl"):
+        return load_jsonl(doc_path)
    elif doc_path.endswith(".yaml"):
        return load_yaml(doc_path)
    elif doc_path.endswith(".xlsx") or doc_path.endswith(".xls"):
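The behavior of the new `.jsonl` path is easy to see in isolation — a self-contained sketch mirroring the `load_jsonl` helper above; unlike `load_json`, each line is parsed independently:

```python
# One JSON object per line in, a list of parsed objects out.
import json

def load_jsonl(jsonl_path):
    content_list = []
    with open(jsonl_path, "r") as file:
        for line in file:
            content_list.append(json.loads(line))
    return content_list

with open("/tmp/sample.jsonl", "w") as f:
    f.write('{"text": "first record"}\n{"text": "second record"}\n')

print(load_jsonl("/tmp/sample.jsonl"))
# [{'text': 'first record'}, {'text': 'second record'}]
```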
2 changes: 1 addition & 1 deletion comps/embeddings/tei/langchain/local_embedding.py
@@ -40,5 +40,5 @@ def embedding(input: TextDoc) -> EmbedDoc:


if __name__ == "__main__":
-    embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-large-en-v1.5")
+    embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5")
    opea_microservices["opea_service@local_embedding"].start()
2 changes: 1 addition & 1 deletion comps/embeddings/tei/llama_index/embedding_tei.py
@@ -31,7 +31,7 @@ def embedding(input: TextDoc) -> EmbedDoc:


if __name__ == "__main__":
-    tei_embedding_model_name = os.getenv("TEI_EMBEDDING_MODEL_NAME", "BAAI/bge-large-en-v1.5")
+    tei_embedding_model_name = os.getenv("TEI_EMBEDDING_MODEL_NAME", "BAAI/bge-base-en-v1.5")
    tei_embedding_endpoint = os.getenv("TEI_EMBEDDING_ENDPOINT", "http://localhost:8090")
    embeddings = TextEmbeddingsInference(model_name=tei_embedding_model_name, base_url=tei_embedding_endpoint)
    logger.info("TEI Gaudi Embedding initialized.")
2 changes: 1 addition & 1 deletion comps/embeddings/tei/llama_index/local_embedding.py
@@ -31,5 +31,5 @@ def embedding(input: TextDoc) -> EmbedDoc:


if __name__ == "__main__":
-    embeddings = HuggingFaceInferenceAPIEmbedding(model_name="BAAI/bge-large-en-v1.5")
+    embeddings = HuggingFaceInferenceAPIEmbedding(model_name="BAAI/bge-base-en-v1.5")
    opea_microservices["opea_service@local_embedding"].start()
42 changes: 36 additions & 6 deletions comps/finetuning/README.md
@@ -86,7 +86,7 @@ docker run --runtime=habana -e HABANA_VISIBLE_DEVICES=all -p 8015:8015 -e OMPI_M

## 🚀3. Consume Finetuning Service

-## 3.1 Upload a training file
+### 3.1 Upload a training file

Download a training file such as `alpaca_data.json` for instruction tuning (it can be downloaded [here](https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json)) and upload it to the server with the command below:

@@ -97,9 +97,9 @@ curl http://${your_ip}:8015/v1/files -X POST -H "Content-Type: multipart/form-da
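The upload can also be scripted from Python — a sketch assuming the OpenAI-style `/v1/files` endpoint shown above and a `purpose` form field of `fine-tune` (the exact field names follow the OpenAI files-API convention and are assumptions here):

```python
# Upload a training file to the finetuning service.
import requests

with open("alpaca_data.json", "rb") as f:
    resp = requests.post(
        "http://localhost:8015/v1/files",
        files={"file": ("alpaca_data.json", f)},
        data={"purpose": "fine-tune"},
    )
print(resp.status_code, resp.text)
```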

For reranking and embedding model finetuning, the training file [toy_finetune_data.jsonl](https://github.com/FlagOpen/FlagEmbedding/blob/master/examples/finetune/toy_finetune_data.jsonl) is a toy example.

-## 3.2 Create fine-tuning job
+### 3.2 Create fine-tuning job

-### 3.2.1 Instruction Tuning
+#### 3.2.1 Instruction Tuning

After a training file like `alpaca_data.json` is uploaded, use the following command to launch a finetuning job with `meta-llama/Llama-2-7b-chat-hf` as the base model:

@@ -114,7 +114,7 @@ curl http://${your_ip}:8015/v1/fine_tuning/jobs \
}'
```

-### 3.2.2 Reranking Model Training
+#### 3.2.2 Reranking Model Training

Use the following command to launch a finetuning job for a reranking model such as `BAAI/bge-reranker-large`:

@@ -133,7 +133,7 @@
}'
```

-### 3.2.3 Embedding Model Training
+#### 3.2.3 Embedding Model Training

Use the following command to launch a finetuning job for an embedding model such as `BAAI/bge-base-en-v1.5`:

@@ -173,7 +173,33 @@

```

-## 3.3 Manage fine-tuning job

#### 3.2.4 LLM Pretraining

Use the following command to launch an LLM pretraining job with a base model such as `meta-llama/Llama-2-7b-hf`:

```bash
# create a finetuning job
curl http://${your_ip}:8015/v1/fine_tuning/jobs \
-X POST \
-H "Content-Type: application/json" \
-d '{
"training_file": "test_data.json",
"model": "meta-llama/Llama-2-7b-hf",
"General":{
"task":"pretraining",
"lora_config":null
}
}'
```

Below is an example of the pretraining dataset format (one JSON object per line):

```json
{"text": "A girl with a blue tank top sitting watching three dogs."}
{"text": "A boy with a blue tank top sitting watching three dogs."}
```

### 3.3 Manage fine-tuning job

The commands below show how to list finetuning jobs, retrieve a finetuning job, cancel a finetuning job, and list the checkpoints of a finetuning job.

@@ -191,6 +217,10 @@ curl http://localhost:8015/v1/fine_tuning/jobs/cancel -X POST -H "Content-Type:
curl http://${your_ip}:8015/v1/finetune/list_checkpoints -X POST -H "Content-Type: application/json" -d '{"fine_tuning_job_id": ${fine_tuning_job_id}}'
```

### 3.4 Leverage fine-tuned model

After a fine-tuning job completes, the fine-tuned model can be chosen from the listed checkpoints and used in other microservices. For example, a fine-tuned reranking model can be used in the [reranks](../reranks/README.md) microservice by assigning its path to the environment variable `RERANK_MODEL_ID`, a fine-tuned embedding model can be used in the [embeddings](../embeddings/README.md) microservice by assigning its path to the environment variable `model`, and an instruction-tuned LLM can be used in the [llms](../llms/README.md) microservice by assigning its path to the environment variable `your_hf_llm_model`.
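As a sketch, the checkpoint lookup and hand-off could be scripted as follows (the job id and the response shape are assumptions, not part of the documented API):

```python
# Fetch the checkpoints of a finished job and point a consuming microservice
# at one of them via the environment variable it reads.
import os
import requests

job_id = "ft-abc123"  # hypothetical id returned when the job was created
resp = requests.post(
    "http://localhost:8015/v1/finetune/list_checkpoints",
    json={"fine_tuning_job_id": job_id},
)
checkpoints = resp.json()  # assumed: an ordered list of checkpoint paths
os.environ["RERANK_MODEL_ID"] = checkpoints[-1]  # e.g. for the reranks service
```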

## 🚀4. Descriptions for Finetuning parameters

We utilize [OpenAI finetuning parameters](https://platform.openai.com/docs/api-reference/fine-tuning) and extend them with additional customizable parameters; see the definitions in [finetune_config](https://github.com/opea-project/GenAIComps/blob/main/comps/finetuning/finetune_config.py).
5 changes: 3 additions & 2 deletions comps/finetuning/finetune_config.py
@@ -16,6 +16,7 @@
DEVICE_CPU = "cpu"
DEVICE_HPU = "hpu"
DEVICE_GPU = "gpu"
DEVICE_CUDA = "cuda"

ACCELERATE_STRATEGY_DDP = "DDP"
ACCELERATE_STRATEGY_FSDP = "FSDP"
Expand Down Expand Up @@ -57,7 +58,7 @@ def check_report_to(cls, v: str):

@validator("task")
def check_task(cls, v: str):
assert v in ["instruction_tuning", "rerank", "embedding"]
assert v in ["instruction_tuning", "pretraining", "rerank", "embedding"]
return v


@@ -136,7 +137,7 @@ class TrainingConfig(BaseModel):
    def check_device(cls, v: str):
        # will convert to lower case
        if v:
-            assert v.lower() in [DEVICE_CPU, DEVICE_GPU, DEVICE_HPU]
+            assert v.lower() in [DEVICE_CPU, DEVICE_GPU, DEVICE_HPU, DEVICE_CUDA]
        return v.lower()

    @validator("hpu_execution_mode")
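The effect of the widened whitelists can be seen with a small pydantic sketch (v1-style validators, matching finetune_config.py; the class below is illustrative, not from the repo):

```python
from pydantic import BaseModel, validator

class GeneralConfigSketch(BaseModel):
    task: str = "instruction_tuning"

    @validator("task")
    def check_task(cls, v: str):
        assert v in ["instruction_tuning", "pretraining", "rerank", "embedding"]
        return v

GeneralConfigSketch(task="pretraining")  # accepted after this change
try:
    GeneralConfigSketch(task="distillation")
except Exception as err:                 # pydantic wraps the AssertionError
    print(type(err).__name__)            # ValidationError
```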
35 changes: 34 additions & 1 deletion comps/finetuning/llm_on_ray/finetune/data_process.py
@@ -18,7 +18,7 @@
IGNORE_INDEX = -100


-class DataProcessor:
+class InstructionDataProcessor:
    # We used the following prompts for fine-tuning the Alpaca model. You can find the reference doc from this URL (https://github.com/tatsu-lab/stanford_alpaca/blob/main/README.md#data-release)
    def __init__(self, config, tokenizer):
        self.tokenizer = tokenizer
@@ -202,6 +202,39 @@ def tokenize(self, examples):
        return examples


class PretrainingDataProcessor:
    def __init__(self, config, tokenizer):
        self.tokenizer = tokenizer
        self.max_length = self.max_seq_length = config["Dataset"].get("max_length", 512)
        self.truncation = config["Dataset"].get("truncation", True)
        self.padding = config["Dataset"].get("padding", True)

    def tokenize(self, examples):
        keys = list(examples.data.keys())
        if len(keys) != 1 and "text" not in keys:
            raise ValueError("Unsupported dataset format")

        key = keys[0] if len(keys) == 1 else "text"
        examples["input_ids"] = []
        examples["labels"] = []
        examples["attention_mask"] = []
        for exp in examples[key]:
            results = self.tokenizer(
                exp,
                padding=self.padding,
                truncation=self.truncation,
                return_tensors=None,
                max_length=self.max_length,
            )

            input_ids = results["input_ids"]
            labels = copy.deepcopy(input_ids)
            examples["input_ids"].append(results["input_ids"])
            examples["labels"].append(labels)
            examples["attention_mask"].append(results["attention_mask"])
        return examples


class TrainDatasetForCE(Dataset):
    def __init__(self, dataset, args, tokenizer):
        self.dataset = dataset
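What `PretrainingDataProcessor.tokenize` produces can be reproduced with any Hugging Face tokenizer — a sketch (gpt2 is used only for illustration); for plain language-model pretraining the labels are simply a copy of `input_ids`:

```python
import copy
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # gpt2 defines no pad token

examples = {"text": ["A girl with a blue tank top sitting watching three dogs."]}
out = {"input_ids": [], "labels": [], "attention_mask": []}
for exp in examples["text"]:
    results = tokenizer(exp, padding=True, truncation=True,
                        return_tensors=None, max_length=512)
    out["input_ids"].append(results["input_ids"])
    out["labels"].append(copy.deepcopy(results["input_ids"]))
    out["attention_mask"].append(results["attention_mask"])

print(out["input_ids"][0] == out["labels"][0])  # True
```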