Merge pull request #121 from amosproj/91_Benchmark_trained_model
Added benchmark
christianwielenberg authored Jul 17, 2024
2 parents 89fd972 + 954d927 commit a4c52ad
Showing 8 changed files with 553 additions and 0 deletions.
49 changes: 49 additions & 0 deletions src/hpc_scripts/benchmark/benchmark/benchmark_cncf.sbatch
@@ -0,0 +1,49 @@
#!/bin/bash -l
#SBATCH --job-name=benchmark_cncf
#SBATCH --ntasks=1
#SBATCH --gres=gpu:a100:1 -p a100
#SBATCH --output=R-%x.%j.out
#SBATCH --error=R-%x.%j.err
#SBATCH --mail-type=end,fail
#SBATCH --time=01:15:00
#SBATCH --export=NONE
unset SLURM_EXPORT_ENV

# Set proxy to access internet from the node
export http_proxy=http://proxy:80
export https_proxy=http://proxy:80

module purge
module load python
module load cuda
module load cudnn

# Conda
conda activate amos_env # replace with the name of your conda env

# Copy data to $TMPDIR for faster access (recommended especially for long jobs)
#cp -r "/home/janus/iwb6-datasets/FRAGMENTS" "$TMPDIR"
cd "$TMPDIR"

# create a temporary job dir on $WORK
mkdir ${WORK}/$SLURM_JOB_ID

# copy input file from location where job was submitted, and run
cp -r ${SLURM_SUBMIT_DIR}/benchmark_results_cncf.py .
mkdir -p output/

# Periodically snapshot GPU utilization (each write overwrites the previous snapshot)
(
while true; do
    nvidia-smi > ${SLURM_SUBMIT_DIR}/gpu_usage_${SLURM_JOB_ID}.log
    sleep 60
done
) &

# Run the benchmark script (generating answers on the node)
torchrun benchmark_results_cncf.py

# Create a directory on $HOME and copy the benchmark results there
mkdir ${HOME}/$SLURM_JOB_ID
cp -r output ${HOME}/$SLURM_JOB_ID

49 changes: 49 additions & 0 deletions src/hpc_scripts/benchmark/benchmark/benchmark_gemma.sbatch
@@ -0,0 +1,49 @@
#!/bin/bash -l
#SBATCH --job-name=benchmark_gemma
#SBATCH --ntasks=1
#SBATCH --gres=gpu:a100:1 -p a100
#SBATCH --output=R-%x.%j.out
#SBATCH --error=R-%x.%j.err
#SBATCH --mail-type=end,fail
#SBATCH --time=01:15:00
#SBATCH --export=NONE
unset SLURM_EXPORT_ENV

# Set proxy to access internet from the node
export http_proxy=http://proxy:80
export https_proxy=http://proxy:80

module purge
module load python
module load cuda
module load cudnn

# Conda
conda activate amos_env # replace with the name of your conda env

# Copy data to $TMPDIR for faster access (recommended especially for long jobs)
#cp -r "/home/janus/iwb6-datasets/FRAGMENTS" "$TMPDIR"
cd "$TMPDIR"

# create a temporary job dir on $WORK
mkdir ${WORK}/$SLURM_JOB_ID

# copy input file from location where job was submitted, and run
cp -r ${SLURM_SUBMIT_DIR}/benchmark_results_gemma.py .
mkdir -p output/

# Periodically snapshot GPU utilization (each write overwrites the previous snapshot)
(
while true; do
    nvidia-smi > ${SLURM_SUBMIT_DIR}/gpu_usage_${SLURM_JOB_ID}.log
    sleep 60
done
) &

# Run the benchmark script (generating answers on the node)
torchrun benchmark_results_gemma.py

# Create a directory on $HOME and copy the benchmark results there
mkdir ${HOME}/$SLURM_JOB_ID
cp -r output ${HOME}/$SLURM_JOB_ID

49 changes: 49 additions & 0 deletions src/hpc_scripts/benchmark/benchmark/benchmark_llama.sbatch
@@ -0,0 +1,49 @@
#!/bin/bash -l
#SBATCH --job-name=benchmark_llama
#SBATCH --ntasks=1
#SBATCH --gres=gpu:a100:1 -p a100
#SBATCH --output=R-%x.%j.out
#SBATCH --error=R-%x.%j.err
#SBATCH --mail-type=end,fail
#SBATCH --time=01:15:00
#SBATCH --export=NONE
unset SLURM_EXPORT_ENV

# Set proxy to access internet from the node
export http_proxy=http://proxy:80
export https_proxy=http://proxy:80

module purge
module load python
module load cuda
module load cudnn

# Conda
conda activate amos_env # replace with the name of your conda env

# Copy data to $TMPDIR for faster access (recommended especially for long jobs)
#cp -r "/home/janus/iwb6-datasets/FRAGMENTS" "$TMPDIR"
cd "$TMPDIR"

# create a temporary job dir on $WORK
mkdir ${WORK}/$SLURM_JOB_ID

# copy input file from location where job was submitted, and run
cp -r ${SLURM_SUBMIT_DIR}/benchmark_results_llama.py .
mkdir -p output/

# Periodically snapshot GPU utilization (each write overwrites the previous snapshot)
(
while true; do
    nvidia-smi > ${SLURM_SUBMIT_DIR}/gpu_usage_${SLURM_JOB_ID}.log
    sleep 60
done
) &

# Run the benchmark script (generating answers on the node)
torchrun benchmark_results_llama.py

# Create a directory on $HOME and copy the benchmark results there
mkdir ${HOME}/$SLURM_JOB_ID
cp -r output ${HOME}/$SLURM_JOB_ID

68 changes: 68 additions & 0 deletions src/hpc_scripts/benchmark/benchmark/benchmark_prometheus.py
@@ -0,0 +1,68 @@
import json

import pandas as pd
from datasets import load_dataset

from prometheus_eval.vllm import VLLM
from prometheus_eval import PrometheusEval
from prometheus_eval.prompts import RELATIVE_PROMPT

dataset = load_dataset("Kubermatic/Merged_QAs", split="train[-50:]")
# The result CSVs are written by the benchmark jobs without a header row
gemma_answers = pd.read_csv("gemma_results.csv", header=None)
cncf_answers = pd.read_csv("cncf_results.csv", header=None)
llama_answers = pd.read_csv("llama_results.csv", header=None)

model = VLLM(model="prometheus-eval/prometheus-7b-v2.0")
judge = PrometheusEval(model=model, relative_grade_template=RELATIVE_PROMPT)

# Build the comparison data structure (metadata, the two models under test, and per-example results)
data_dic = {
    "metadata": [
        {
            "source_path": "First Run",
            "custom_fields_schema": []
        }
    ],
    "models": [
        {"name": "Base Gemma Model 9B"},
        {"name": "Finetuned Model 9B"}
    ],
    "examples": []
}
for i in range(len(gemma_answers.index)):
    data = {
        "instruction": dataset["Question"][i],
        "response_A": gemma_answers.iloc[i, 0],
        "response_B": cncf_answers.iloc[i, 0],
        "reference_answer": f"{dataset['Question'][i]} \n{dataset['Answer'][i]}",
        "rubric": "Which is the better answer to the question, taking into account the reference answer?"
    }

    feedback, score = judge.single_relative_grade(**data)
    # Map the verdict to a numeric score: -1 if response A (base model) wins, 1 if response B (fine-tuned model) wins
    score = -1 if score == 'A' else 1

    example = {
        "input_text": dataset["Question"][i],
        "tags": ["CNCF"],  # A list of keywords for categorizing prompts
        "output_text_a": gemma_answers.iloc[i, 0],
        "output_text_b": cncf_answers.iloc[i, 0],
        "score": score,  # Score from the judge LLM
        "individual_rater_scores": [],
        "custom_fields": {}
    }
    data_dic["examples"].append(example)

    print("Feedback:", feedback)
    print("Score:", score)

file_path = "output/prometheus.json"

with open(file_path, 'w') as json_file:
    json.dump(data_dic, json_file, indent=4)
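
The JSON written above pairs each question with both models' answers and the judge's verdict, so a small follow-up script can compute a head-to-head win rate. A minimal sketch, assuming the job has finished and output/prometheus.json has the layout built above; this summarizer is illustrative and not part of the commit:

import json

# Load the comparison file produced by benchmark_prometheus.py
with open("output/prometheus.json") as f:
    results = json.load(f)

# Each example is scored -1 (response A, the base model, preferred)
# or 1 (response B, the fine-tuned model, preferred)
scores = [ex["score"] for ex in results["examples"]]
wins_b = sum(1 for s in scores if s == 1)

model_a = results["models"][0]["name"]
model_b = results["models"][1]["name"]
print(f"{model_b} preferred over {model_a} in {wins_b}/{len(scores)} comparisons")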
52 changes: 52 additions & 0 deletions src/hpc_scripts/benchmark/benchmark/benchmark_prometheus.sbatch
@@ -0,0 +1,52 @@
#!/bin/bash -l
#SBATCH --job-name=benchmark_prometheus
#SBATCH --ntasks=1
#SBATCH --gres=gpu:a100:1 -p a100
#SBATCH --output=R-%x.%j.out
#SBATCH --error=R-%x.%j.err
#SBATCH --mail-type=end,fail
#SBATCH --time=01:15:00
#SBATCH --export=NONE
unset SLURM_EXPORT_ENV

# Set proxy to access internet from the node
export http_proxy=http://proxy:80
export https_proxy=http://proxy:80

module purge
module load python
module load cuda
module load cudnn

# Conda
conda activate amos_env # replace with the name of your conda env

# Copy data to $TMPDIR for faster access (recommended especially for long jobs)
#cp -r "/home/janus/iwb6-datasets/FRAGMENTS" "$TMPDIR"
cd "$TMPDIR"

# create a temporary job dir on $WORK
mkdir ${WORK}/$SLURM_JOB_ID

# copy input file from location where job was submitted, and run
cp -r ${SLURM_SUBMIT_DIR}/benchmark_prometheus.py .
cp -r ${SLURM_SUBMIT_DIR}/cncf_results.csv .
cp -r ${SLURM_SUBMIT_DIR}/gemma_results.csv .
cp -r ${SLURM_SUBMIT_DIR}/llama_results.csv .
mkdir -p output/

# Periodically snapshot GPU utilization (each write overwrites the previous snapshot)
(
while true; do
    nvidia-smi > ${SLURM_SUBMIT_DIR}/gpu_usage_${SLURM_JOB_ID}.log
    sleep 60
done
) &

# Run the evaluation script (judging the generated answers with Prometheus)
python benchmark_prometheus.py

# Create a directory on $HOME and copy the evaluation results there
mkdir ${HOME}/$SLURM_JOB_ID
cp -r output ${HOME}/$SLURM_JOB_ID

80 changes: 80 additions & 0 deletions src/hpc_scripts/benchmark/benchmark/benchmark_results_cncf.py
@@ -0,0 +1,80 @@
from huggingface_hub import HfApi, login
import os

# Read the Hugging Face token from the environment rather than hardcoding it
HF_TOKEN = os.environ['HF_TOKEN']
api = HfApi()
login(HF_TOKEN, add_to_git_credential=True)

from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
import torch
from tqdm import tqdm
import torch.distributed as dist
import torch.multiprocessing as mp
from multiprocessing import freeze_support
from peft import PeftModel

import csv
import gc

NUM_GPUS = 1
dataset = load_dataset("Kubermatic/Merged_QAs", split="train[-50:]")

# Function to run inference
def run_inference(rank, world_size, data_length):
    dist.init_process_group("gloo", rank=rank, world_size=world_size)

    base_model = AutoModelForCausalLM.from_pretrained("google/gemma-2-9b-it", device_map=f"cuda:{rank}")
    model = PeftModel.from_pretrained(base_model, "Kubermatic/DeepCNCF9BAdapter", device_map=f"cuda:{rank}")
    tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b-it")
    model.eval()

    # Shard the dataset evenly across ranks; the last rank takes the remainder
    start_index = int(rank * data_length / world_size)
    end_index = data_length if rank == world_size - 1 else int((rank + 1) * data_length / world_size)

    with torch.no_grad():
        for i in tqdm(range(start_index, end_index)):
            question = dataset['Question'][i]

            try:
                chat = [
                    {"role": "user", "content": question},
                ]
                prompt = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
                inputs = tokenizer(prompt, add_special_tokens=False, return_tensors="pt").to(f"cuda:{rank}")

                outputs = model.generate(**inputs,
                                         max_new_tokens=512,
                                         do_sample=True)
                result = tokenizer.decode(outputs[0])
                print(result)

                with open(f"output/benchmark_results{rank}.csv", 'a+', newline='') as file:
                    write = csv.writer(file)
                    write.writerow([result])

                # Clean up to free memory
                del inputs, outputs, result, question
                torch.cuda.empty_cache()
                gc.collect()
            except Exception as error:
                print("An error occurred:", type(error).__name__, "-", error, flush=True)
                torch.cuda.empty_cache()
                gc.collect()

    del model
    torch.cuda.empty_cache()
    gc.collect()

# Main script
if __name__ == '__main__':
    freeze_support()
    data_length = len(dataset['Question'])
    # Spawn one worker per GPU; each worker generates answers for its shard of the questions
    mp.spawn(run_inference,
             args=(NUM_GPUS, data_length),
             nprocs=NUM_GPUS,
             join=True)
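
Each rank appends its generations to output/benchmark_results{rank}.csv, while benchmark_prometheus.py reads a single cncf_results.csv (and likewise gemma_results.csv and llama_results.csv). A minimal sketch of a merge step under that assumption; the commit itself does not include one, and the file names here simply mirror the scripts above:

import csv
import glob

# Concatenate the per-rank result shards into the single CSV the judge script expects
with open("cncf_results.csv", "w", newline="") as out_file:
    writer = csv.writer(out_file)
    for shard in sorted(glob.glob("output/benchmark_results*.csv")):
        with open(shard, newline="") as in_file:
            for row in csv.reader(in_file):
                writer.writerow(row)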