fix merge main conflict

Signed-off-by: letonghan <letong.han@intel.com>
letonghan committed Sep 10, 2024
2 parents 35afbe6 + 94eb60f commit 90a3f4b
Showing 36 changed files with 920 additions and 67 deletions.
8 changes: 8 additions & 0 deletions .github/workflows/docker/compose/reranks-compose-cd.yaml
@@ -14,3 +14,11 @@ services:
    build:
      dockerfile: comps/reranks/mosec/langchain/Dockerfile
    image: ${REGISTRY:-opea}/reranking-langchain-mosec:${TAG:-latest}
  reranking-mosec-neural-speed:
    build:
      dockerfile: comps/reranks/neural-speed/docker/Dockerfile
    image: ${REGISTRY:-opea}/reranking-mosec-neural-speed:${TAG:-latest}
  reranking-mosec-neural-speed-endpoint:
    build:
      dockerfile: comps/reranks/neural-speed/neuralspeed-docker/Dockerfile
    image: ${REGISTRY:-opea}/reranking-mosec-neural-speed-endpoint:${TAG:-latest}
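As a quick local sanity check, entries like these can be validated with a few lines of Python — a sketch, assuming PyYAML is installed and a repository checkout with the layout above:

```python
# Validate a compose CD file: every service should declare a build.dockerfile
# and an image tag, mirroring the entries added above.
import yaml  # PyYAML, assumed installed

with open(".github/workflows/docker/compose/reranks-compose-cd.yaml") as f:
    compose = yaml.safe_load(f)

for name, svc in compose["services"].items():
    assert "dockerfile" in svc.get("build", {}), f"{name}: missing build.dockerfile"
    assert "image" in svc, f"{name}: missing image"
    print(f"{name}: OK")
```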
65 changes: 33 additions & 32 deletions .github/workflows/pr-dockerfile-path-scan.yaml
@@ -51,6 +51,7 @@ jobs:
          fi
      - name: Check for changed Dockerfile paths in readme
        if: always()
        run: |
          set -e
          shopt -s globstar
@@ -75,6 +76,37 @@
          exit 1
          fi
      - name: Check new Dockerfile in compose yaml
        if: always()
        run: |
          set -xe
          shopt -s globstar
          cd ${{github.workspace}}
          no_add="FALSE"
          merged_commit=$(git log -1 --format='%H')
          changed_files="$(git diff --name-status --diff-filter=A ${{ github.event.pull_request.base.sha }} ${merged_commit} -- '**/Dockerfile**' | cut -f2)"
          changed_yamls="$(git diff --name-status --diff-filter=AM ${{ github.event.pull_request.base.sha }} ${merged_commit} -- '**/*.yaml**' | cut -f2)"
          if [ -n "$changed_files" ]; then
            for file in $changed_files; do
              service=$(echo "$file" | awk -F '/' '{print $2}')
              if find "${{github.workspace}}/.github/workflows/docker/compose/" -name "*$service*" | grep -q .; then
                if [ -n "$changed_files" ] && grep -q $service'-compose-cd.yaml' <<< "$changed_yamls"; then
                  echo "The $file has been added to the ${{github.workspace}}/.github/workflows/docker/compose/"$service"-compose-cd.yaml."
                else
                  echo "Please check if the added $file is included in the yaml under path ${{github.workspace}}/.github/workflows/docker/compose/"$service"-compose-cd.yaml."
                  no_add="TRUE"
                fi
              else
                echo "Please create a new compose file named "$service"-compose-cd.yaml in ${{github.workspace}}/.github/workflows/docker/compose/ for $file and fill it in."
                no_add="TRUE"
              fi
            done
          fi
          if [[ "$no_add" == "TRUE" ]]; then
            exit 1
          fi
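For reference, the convention this step enforces can be expressed as a standalone Python sketch (the paths and inputs below are illustrative, not part of the workflow):

```python
# For each newly added Dockerfile, take the second path component as the
# service name and require a modified <service>-compose-cd.yaml under
# .github/workflows/docker/compose/ — the same rule as the bash step above.
from pathlib import Path

COMPOSE_DIR = Path(".github/workflows/docker/compose")

def check_new_dockerfiles(added_dockerfiles, changed_yamls):
    problems = []
    for file in added_dockerfiles:
        service = Path(file).parts[1]  # comps/reranks/... -> "reranks"
        cd_yaml = f"{service}-compose-cd.yaml"
        if not any(COMPOSE_DIR.glob(f"*{service}*")):
            problems.append(f"create {cd_yaml} for {file}")
        elif not any(y.endswith(cd_yaml) for y in changed_yamls):
            problems.append(f"add {file} to {cd_yaml}")
    return problems

# With the compose change above, the added Dockerfile passes the check when
# run from a repository checkout (an empty list means no problems found):
print(check_new_dockerfiles(
    ["comps/reranks/neural-speed/docker/Dockerfile"],
    [".github/workflows/docker/compose/reranks-compose-cd.yaml"],
))
```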
  Dockerfile-path-change-detection-in-GenAIExamples:
    runs-on: ubuntu-latest
    steps:
@@ -100,7 +132,7 @@ jobs:
          is_use="FALSE"
          used_files=""
          merged_commit=$(git log -1 --format='%H')
-          changed_files="$(git diff --name-status --diff-filter=DR ${{ github.event.pull_request.base.sha }} ${merged_commit} -- '**/Dockerfile' | cut -f2)"
+          changed_files="$(git diff --name-status --diff-filter=DR ${{ github.event.pull_request.base.sha }} ${merged_commit} -- '**/Dockerfile**' | cut -f2)"
          if [ -n "$changed_files" ]; then
            for file in $changed_files; do
              matching_files=$(grep -rl "$file" ../GenAIExamples/**/*.md)
@@ -119,34 +151,3 @@
          echo "Please modify the corresponding README in GenAIExamples repo and ask suyue.chen@intel.com for final confirmation."
          exit 1
          fi
-  Dockerfile-addition-detection-in-GenAIComps:
-    runs-on: ubuntu-latest
-    steps:
-      - name: Clean Up Working Directory
-        run: sudo rm -rf ${{github.workspace}}/*
-
-      - name: Checkout code
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-
-      - name: Check if the Dockerfile has been added
-        run: |
-          set -e
-          shopt -s globstar
-          cd ${{github.workspace}}
-          is_use="FALSE"
-          used_files=""
-          merged_commit=$(git log -1 --format='%H')
-          changed_files="$(git diff --name-status --diff-filter=A ${{ github.event.pull_request.base.sha }} ${merged_commit} -- '**/Dockerfile**' | cut -f2)"
-          if [ -n "$changed_files" ]; then
-            for file in $changed_files; do
-              if find "${{github.workspace}}/.github/workflows/docker/compose/" -name "*$(echo "$file" | awk -F '/' '{print $2}')*" | grep -q .; then
-                echo "Please check if the added $file is included in the yaml under path ${{github.workspace}}/.github/workflows/docker/compose/."
-              else
-                echo "Please create a new compose file named service_name-compose-cd.yaml in ${{github.workspace}}/.github/workflows/docker/compose/ for $file and fill it in."
-              fi
-            done
-            exit 1
-          fi
1 change: 1 addition & 0 deletions .github/workflows/pr-microservice-test.yml
@@ -47,6 +47,7 @@ jobs:
          GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }}
          GOOGLE_CSE_ID: ${{ secrets.GOOGLE_CSE_ID }}
          PINECONE_KEY: ${{ secrets.PINECONE_KEY }}
          PREDICTIONGUARD_API_KEY: ${{ secrets.PREDICTIONGUARD_API_KEY }}
          service: ${{ matrix.service }}
          hardware: ${{ matrix.hardware }}
        run: |
4 changes: 2 additions & 2 deletions README.md
@@ -55,7 +55,7 @@ The initially supported `Microservices` are described in the below table. More `
<tr>
<td rowspan="2"><a href="./comps/embeddings/README.md">Embedding</a></td>
<td rowspan="2"><a href="https://www.langchain.com">LangChain</a>/<a href="https://www.llamaindex.ai">LlamaIndex</a></td>
-<td rowspan="2"><a href="https://huggingface.co/BAAI/bge-large-en-v1.5">BAAI/bge-large-en-v1.5</a></td>
+<td rowspan="2"><a href="https://huggingface.co/BAAI/bge-base-en-v1.5">BAAI/bge-base-en-v1.5</a></td>
<td><a href="https://github.com/huggingface/tei-gaudi">TEI-Gaudi</a></td>
<td>Gaudi2</td>
<td>Embedding on Gaudi2</td>
@@ -76,7 +76,7 @@ The initially supported `Microservices` are described in the below table. More `
<tr>
<td rowspan="2"><a href="./comps/reranks/README.md">Reranking</a></td>
<td rowspan="2"><a href="https://www.langchain.com">LangChain</a>/<a href="https://www.llamaindex.ai">LlamaIndex</a></td>
-<td ><a href="https://huggingface.co/BAAI/bge-reranker-large">BAAI/bge-reranker-large</a></td>
+<td ><a href="https://huggingface.co/BAAI/bge-reranker-base">BAAI/bge-reranker-base</a></td>
<td><a href="https://github.com/huggingface/tei-gaudi">TEI-Gaudi</a></td>
<td>Gaudi2</td>
<td>Reranking on Gaudi2</td>
4 changes: 2 additions & 2 deletions comps/dataprep/redis/README.md
@@ -49,8 +49,8 @@ First, you need to start a TEI service.

```bash
your_port=6006
model="BAAI/bge-large-en-v1.5"
docker run -p $your_port:80 -v ./data:/data --name tei_server -e http_proxy=$http_proxy -e https_proxy=$https_proxy --pull always ghcr.io/huggingface/text-embeddings-inference:cpu-1.2 --model-id $model
model="BAAI/bge-base-en-v1.5"
docker run -p $your_port:80 -v ./data:/data --name tei_server -e http_proxy=$http_proxy -e https_proxy=$https_proxy --pull always ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 --model-id $model
```
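Once the container is up, the service can also be sanity-checked from Python — a minimal sketch, assuming TEI's standard `/embed` route and the port mapping chosen above:

```python
# Probe the TEI container started above with a single input.
import requests

resp = requests.post(
    "http://localhost:6006/embed",
    json={"inputs": "What is Deep Learning?"},
    timeout=30,
)
resp.raise_for_status()
print(len(resp.json()[0]))  # embedding dimension; 768 for bge-base-en-v1.5
```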

Then you need to test your TEI service using the following commands:
2 changes: 1 addition & 1 deletion comps/dataprep/redis/langchain/config.py
@@ -5,7 +5,7 @@

# Embedding model

-EMBED_MODEL = os.getenv("EMBED_MODEL", "BAAI/bge-large-en-v1.5")
+EMBED_MODEL = os.getenv("EMBED_MODEL", "BAAI/bge-base-en-v1.5")

# Redis Connection Information
REDIS_HOST = os.getenv("REDIS_HOST", "localhost")
14 changes: 13 additions & 1 deletion comps/dataprep/utils.py
@@ -285,6 +285,16 @@ def load_json(json_path):
    return content_list


def load_jsonl(jsonl_path):
    """Load and process jsonl file."""
    content_list = []
    with open(jsonl_path, "r") as file:
        for line in file:
            json_obj = json.loads(line)
            content_list.append(json_obj)
    return content_list


def load_yaml(yaml_path):
    """Load and process yaml file."""
    with open(yaml_path, "r") as file:
@@ -351,8 +361,10 @@ def document_loader(doc_path):
        return load_md(doc_path)
    elif doc_path.endswith(".xml"):
        return load_xml(doc_path)
-    elif doc_path.endswith(".json") or doc_path.endswith(".jsonl"):
+    elif doc_path.endswith(".json"):
        return load_json(doc_path)
+    elif doc_path.endswith(".jsonl"):
+        return load_jsonl(doc_path)
    elif doc_path.endswith(".yaml"):
        return load_yaml(doc_path)
    elif doc_path.endswith(".xlsx") or doc_path.endswith(".xls"):
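The behavior of the new `.jsonl` path is easy to see in isolation — a self-contained sketch mirroring the `load_jsonl` helper above; unlike `load_json`, each line is parsed independently:

```python
# One JSON object per line in, a list of parsed objects out.
import json

def load_jsonl(jsonl_path):
    content_list = []
    with open(jsonl_path, "r") as file:
        for line in file:
            content_list.append(json.loads(line))
    return content_list

with open("/tmp/sample.jsonl", "w") as f:
    f.write('{"text": "first record"}\n{"text": "second record"}\n')

print(load_jsonl("/tmp/sample.jsonl"))
# [{'text': 'first record'}, {'text': 'second record'}]
```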
2 changes: 1 addition & 1 deletion comps/embeddings/tei/langchain/local_embedding.py
@@ -40,5 +40,5 @@ def embedding(input: TextDoc) -> EmbedDoc:


if __name__ == "__main__":
-    embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-large-en-v1.5")
+    embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5")
    opea_microservices["opea_service@local_embedding"].start()
2 changes: 1 addition & 1 deletion comps/embeddings/tei/llama_index/embedding_tei.py
@@ -31,7 +31,7 @@ def embedding(input: TextDoc) -> EmbedDoc:


if __name__ == "__main__":
-    tei_embedding_model_name = os.getenv("TEI_EMBEDDING_MODEL_NAME", "BAAI/bge-large-en-v1.5")
+    tei_embedding_model_name = os.getenv("TEI_EMBEDDING_MODEL_NAME", "BAAI/bge-base-en-v1.5")
    tei_embedding_endpoint = os.getenv("TEI_EMBEDDING_ENDPOINT", "http://localhost:8090")
    embeddings = TextEmbeddingsInference(model_name=tei_embedding_model_name, base_url=tei_embedding_endpoint)
    logger.info("TEI Gaudi Embedding initialized.")
2 changes: 1 addition & 1 deletion comps/embeddings/tei/llama_index/local_embedding.py
@@ -31,5 +31,5 @@ def embedding(input: TextDoc) -> EmbedDoc:


if __name__ == "__main__":
-    embeddings = HuggingFaceInferenceAPIEmbedding(model_name="BAAI/bge-large-en-v1.5")
+    embeddings = HuggingFaceInferenceAPIEmbedding(model_name="BAAI/bge-base-en-v1.5")
    opea_microservices["opea_service@local_embedding"].start()
42 changes: 36 additions & 6 deletions comps/finetuning/README.md
@@ -86,7 +86,7 @@ docker run --runtime=habana -e HABANA_VISIBLE_DEVICES=all -p 8015:8015 -e OMPI_M

## 🚀3. Consume Finetuning Service

-## 3.1 Upload a training file
+### 3.1 Upload a training file

Download a training file such as `alpaca_data.json` for instruction tuning (it can be downloaded [here](https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json)) and upload it to the server with the command below:

@@ -97,9 +97,9 @@ curl http://${your_ip}:8015/v1/files -X POST -H "Content-Type: multipart/form-da
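The upload can also be scripted from Python — a sketch assuming the OpenAI-style `/v1/files` endpoint shown above and a `purpose` form field of `fine-tune` (the exact field names follow the OpenAI files-API convention and are assumptions here):

```python
# Upload a training file to the finetuning service.
import requests

with open("alpaca_data.json", "rb") as f:
    resp = requests.post(
        "http://localhost:8015/v1/files",
        files={"file": ("alpaca_data.json", f)},
        data={"purpose": "fine-tune"},
    )
print(resp.status_code, resp.text)
```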

For reranking and embedding model finetuning, the training file [toy_finetune_data.jsonl](https://github.com/FlagOpen/FlagEmbedding/blob/master/examples/finetune/toy_finetune_data.jsonl) is a toy example.

-## 3.2 Create fine-tuning job
+### 3.2 Create fine-tuning job

-### 3.2.1 Instruction Tuning
+#### 3.2.1 Instruction Tuning

After a training file like `alpaca_data.json` is uploaded, use the following command to launch a finetuning job with `meta-llama/Llama-2-7b-chat-hf` as the base model:

@@ -114,7 +114,7 @@ curl http://${your_ip}:8015/v1/fine_tuning/jobs \
}'
```

-### 3.2.2 Reranking Model Training
+#### 3.2.2 Reranking Model Training

Use the following command to launch a finetuning job for a reranking model such as `BAAI/bge-reranker-large`:

@@ -133,7 +133,7 @@
}'
```

-### 3.2.3 Embedding Model Training
+#### 3.2.3 Embedding Model Training

Use the following command to launch a finetuning job for an embedding model such as `BAAI/bge-base-en-v1.5`:

@@ -173,7 +173,33 @@

```

-## 3.3 Manage fine-tuning job

#### 3.2.4 LLM Pretraining

Use the following command to launch an LLM pretraining job with a base model such as `meta-llama/Llama-2-7b-hf`:

```bash
# create a finetuning job
curl http://${your_ip}:8015/v1/fine_tuning/jobs \
-X POST \
-H "Content-Type: application/json" \
-d '{
"training_file": "test_data.json",
"model": "meta-llama/Llama-2-7b-hf",
"General":{
"task":"pretraining",
"lora_config":null
}
}'
```

Below is an example of the pretraining dataset format (one JSON object per line):

```json
{"text": "A girl with a blue tank top sitting watching three dogs."}
{"text": "A boy with a blue tank top sitting watching three dogs."}
```

### 3.3 Manage fine-tuning job

The commands below show how to list finetuning jobs, retrieve a finetuning job, cancel a finetuning job, and list the checkpoints of a finetuning job.

@@ -191,6 +217,10 @@ curl http://localhost:8015/v1/fine_tuning/jobs/cancel -X POST -H "Content-Type:
curl http://${your_ip}:8015/v1/finetune/list_checkpoints -X POST -H "Content-Type: application/json" -d '{"fine_tuning_job_id": ${fine_tuning_job_id}}'
```

### 3.4 Leverage fine-tuned model

After a fine-tuning job completes, the fine-tuned model can be chosen from the listed checkpoints and used in other microservices. For example, a fine-tuned reranking model can be used in the [reranks](../reranks/README.md) microservice by assigning its path to the environment variable `RERANK_MODEL_ID`, a fine-tuned embedding model can be used in the [embeddings](../embeddings/README.md) microservice by assigning its path to the environment variable `model`, and an instruction-tuned LLM can be used in the [llms](../llms/README.md) microservice by assigning its path to the environment variable `your_hf_llm_model`.
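As a sketch, the checkpoint lookup and hand-off could be scripted as follows (the job id and the response shape are assumptions, not part of the documented API):

```python
# Fetch the checkpoints of a finished job and point a consuming microservice
# at one of them via the environment variable it reads.
import os
import requests

job_id = "ft-abc123"  # hypothetical id returned when the job was created
resp = requests.post(
    "http://localhost:8015/v1/finetune/list_checkpoints",
    json={"fine_tuning_job_id": job_id},
)
checkpoints = resp.json()  # assumed: an ordered list of checkpoint paths
os.environ["RERANK_MODEL_ID"] = checkpoints[-1]  # e.g. for the reranks service
```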

## 🚀4. Descriptions for Finetuning parameters

We utilize [OpenAI finetuning parameters](https://platform.openai.com/docs/api-reference/fine-tuning) and extend them with additional customizable parameters; see the definitions in [finetune_config](https://github.com/opea-project/GenAIComps/blob/main/comps/finetuning/finetune_config.py).
5 changes: 3 additions & 2 deletions comps/finetuning/finetune_config.py
@@ -16,6 +16,7 @@
DEVICE_CPU = "cpu"
DEVICE_HPU = "hpu"
DEVICE_GPU = "gpu"
DEVICE_CUDA = "cuda"

ACCELERATE_STRATEGY_DDP = "DDP"
ACCELERATE_STRATEGY_FSDP = "FSDP"
Expand Down Expand Up @@ -57,7 +58,7 @@ def check_report_to(cls, v: str):

@validator("task")
def check_task(cls, v: str):
assert v in ["instruction_tuning", "rerank", "embedding"]
assert v in ["instruction_tuning", "pretraining", "rerank", "embedding"]
return v


@@ -136,7 +137,7 @@ class TrainingConfig(BaseModel):
    def check_device(cls, v: str):
        # will convert to lower case
        if v:
-            assert v.lower() in [DEVICE_CPU, DEVICE_GPU, DEVICE_HPU]
+            assert v.lower() in [DEVICE_CPU, DEVICE_GPU, DEVICE_HPU, DEVICE_CUDA]
        return v.lower()

    @validator("hpu_execution_mode")
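The effect of the widened whitelists can be seen with a small pydantic sketch (v1-style validators, matching finetune_config.py; the class below is illustrative, not from the repo):

```python
from pydantic import BaseModel, validator

class GeneralConfigSketch(BaseModel):
    task: str = "instruction_tuning"

    @validator("task")
    def check_task(cls, v: str):
        assert v in ["instruction_tuning", "pretraining", "rerank", "embedding"]
        return v

GeneralConfigSketch(task="pretraining")  # accepted after this change
try:
    GeneralConfigSketch(task="distillation")
except Exception as err:                 # pydantic wraps the AssertionError
    print(type(err).__name__)            # ValidationError
```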
35 changes: 34 additions & 1 deletion comps/finetuning/llm_on_ray/finetune/data_process.py
@@ -18,7 +18,7 @@
IGNORE_INDEX = -100


-class DataProcessor:
+class InstructionDataProcessor:
    # We used the following prompts for fine-tuning the Alpaca model. You can find the reference doc from this URL (https://github.com/tatsu-lab/stanford_alpaca/blob/main/README.md#data-release)
    def __init__(self, config, tokenizer):
        self.tokenizer = tokenizer
@@ -202,6 +202,39 @@ def tokenize(self, examples):
        return examples


class PretrainingDataProcessor:
    def __init__(self, config, tokenizer):
        self.tokenizer = tokenizer
        self.max_length = self.max_seq_length = config["Dataset"].get("max_length", 512)
        self.truncation = config["Dataset"].get("truncation", True)
        self.padding = config["Dataset"].get("padding", True)

    def tokenize(self, examples):
        keys = list(examples.data.keys())
        if len(keys) != 1 and "text" not in keys:
            raise ValueError("Unsupported dataset format")

        key = keys[0] if len(keys) == 1 else "text"
        examples["input_ids"] = []
        examples["labels"] = []
        examples["attention_mask"] = []
        for exp in examples[key]:
            results = self.tokenizer(
                exp,
                padding=self.padding,
                truncation=self.truncation,
                return_tensors=None,
                max_length=self.max_length,
            )

            input_ids = results["input_ids"]
            labels = copy.deepcopy(input_ids)
            examples["input_ids"].append(results["input_ids"])
            examples["labels"].append(labels)
            examples["attention_mask"].append(results["attention_mask"])
        return examples


class TrainDatasetForCE(Dataset):
    def __init__(self, dataset, args, tokenizer):
        self.dataset = dataset
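What `PretrainingDataProcessor.tokenize` produces can be reproduced with any Hugging Face tokenizer — a sketch (gpt2 is used only for illustration); for plain language-model pretraining the labels are simply a copy of `input_ids`:

```python
import copy
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # gpt2 defines no pad token

examples = {"text": ["A girl with a blue tank top sitting watching three dogs."]}
out = {"input_ids": [], "labels": [], "attention_mask": []}
for exp in examples["text"]:
    results = tokenizer(exp, padding=True, truncation=True,
                        return_tensors=None, max_length=512)
    out["input_ids"].append(results["input_ids"])
    out["labels"].append(copy.deepcopy(results["input_ids"]))
    out["attention_mask"].append(results["attention_mask"])

print(out["input_ids"][0] == out["labels"][0])  # True
```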