diff --git a/prepare/metrics/llm_as_judge/llamaguard.py b/prepare/metrics/llm_as_judge/llamaguard.py index 75464515a..d23d87559 100644 --- a/prepare/metrics/llm_as_judge/llamaguard.py +++ b/prepare/metrics/llm_as_judge/llamaguard.py @@ -1,6 +1,7 @@ from unitxt import add_to_catalog from unitxt.inference import IbmGenAiInferenceEngine from unitxt.llm_as_judge import LLMAsJudge +from unitxt.random_utils import get_seed model_list = [ "meta-llama/llama-3-8b-instruct", @@ -11,7 +12,9 @@ task = "rating.single_turn" for model_id in model_list: - inference_model = IbmGenAiInferenceEngine(model_name=model_id, max_new_tokens=252) + inference_model = IbmGenAiInferenceEngine( + model_name=model_id, max_new_tokens=252, random_seed=get_seed() + ) model_label = model_id.split("/")[1].replace("-", "_").replace(".", ",").lower() model_label = f"{model_label}_ibm_genai" template_label = template.split(".")[-1] diff --git a/prepare/metrics/llm_as_judge/rating/llama_3_ibm_genai_generic_template.py b/prepare/metrics/llm_as_judge/rating/llama_3_ibm_genai_generic_template.py index 961e86c26..10e228d88 100644 --- a/prepare/metrics/llm_as_judge/rating/llama_3_ibm_genai_generic_template.py +++ b/prepare/metrics/llm_as_judge/rating/llama_3_ibm_genai_generic_template.py @@ -1,12 +1,15 @@ from unitxt import add_to_catalog from unitxt.inference import IbmGenAiInferenceEngine from unitxt.llm_as_judge import LLMAsJudge +from unitxt.random_utils import get_seed model = "meta-llama/llama-3-70b-instruct" format = "formats.llama3_instruct" template = "templates.response_assessment.rating.generic_single_turn" -inference_model = IbmGenAiInferenceEngine(model_name=model, max_new_tokens=252) +inference_model = IbmGenAiInferenceEngine( + model_name=model, max_new_tokens=252, random_seed=get_seed() +) model_label = model.split("/")[1].replace("-", "_").replace(".", ",").lower() model_label = f"{model_label}_ibm_genai" template_label = template.split(".")[-1] diff --git a/prepare/metrics/llm_as_judge/rating/llama_3_ibm_genai_mt_bench_template.py b/prepare/metrics/llm_as_judge/rating/llama_3_ibm_genai_mt_bench_template.py index 8a2e1815b..7d659ce31 100644 --- a/prepare/metrics/llm_as_judge/rating/llama_3_ibm_genai_mt_bench_template.py +++ b/prepare/metrics/llm_as_judge/rating/llama_3_ibm_genai_mt_bench_template.py @@ -1,6 +1,7 @@ from unitxt import add_to_catalog from unitxt.inference import IbmGenAiInferenceEngine from unitxt.llm_as_judge import LLMAsJudge +from unitxt.random_utils import get_seed model_list = ["meta-llama/llama-3-8b-instruct", "meta-llama/llama-3-70b-instruct"] format = "formats.llama3_instruct" @@ -8,7 +9,9 @@ task = "rating.single_turn" for model_id in model_list: - inference_model = IbmGenAiInferenceEngine(model_name=model_id, max_new_tokens=252) + inference_model = IbmGenAiInferenceEngine( + model_name=model_id, max_new_tokens=252, random_seed=get_seed() + ) model_label = model_id.split("/")[1].replace("-", "_").replace(".", ",").lower() model_label = f"{model_label}_ibm_genai" template_label = template.split(".")[-1] diff --git a/src/unitxt/catalog/metrics/llm_as_judge/rating/llama_3_70b_instruct_ibm_genai_template_generic_single_turn.json b/src/unitxt/catalog/metrics/llm_as_judge/rating/llama_3_70b_instruct_ibm_genai_template_generic_single_turn.json index d5cc8a9c2..f2eb862e5 100644 --- a/src/unitxt/catalog/metrics/llm_as_judge/rating/llama_3_70b_instruct_ibm_genai_template_generic_single_turn.json +++ b/src/unitxt/catalog/metrics/llm_as_judge/rating/llama_3_70b_instruct_ibm_genai_template_generic_single_turn.json @@ -3,7 +3,8 @@ "inference_model": { "__type__": "ibm_gen_ai_inference_engine", "model_name": "meta-llama/llama-3-70b-instruct", - "max_new_tokens": 252 + "max_new_tokens": 252, + "random_seed": 42 }, "template": "templates.response_assessment.rating.generic_single_turn", "task": "rating.single_turn", diff --git a/src/unitxt/catalog/metrics/llm_as_judge/rating/llama_3_70b_instruct_ibm_genai_template_generic_single_turn_with_reference.json b/src/unitxt/catalog/metrics/llm_as_judge/rating/llama_3_70b_instruct_ibm_genai_template_generic_single_turn_with_reference.json index a446726c5..0e53ebc40 100644 --- a/src/unitxt/catalog/metrics/llm_as_judge/rating/llama_3_70b_instruct_ibm_genai_template_generic_single_turn_with_reference.json +++ b/src/unitxt/catalog/metrics/llm_as_judge/rating/llama_3_70b_instruct_ibm_genai_template_generic_single_turn_with_reference.json @@ -3,7 +3,8 @@ "inference_model": { "__type__": "ibm_gen_ai_inference_engine", "model_name": "meta-llama/llama-3-70b-instruct", - "max_new_tokens": 252 + "max_new_tokens": 252, + "random_seed": 42 }, "template": "templates.response_assessment.rating.generic_single_turn_with_reference", "task": "rating.single_turn_with_reference", diff --git a/src/unitxt/catalog/metrics/llm_as_judge/rating/llama_3_70b_instruct_ibm_genai_template_mt_bench_single_turn.json b/src/unitxt/catalog/metrics/llm_as_judge/rating/llama_3_70b_instruct_ibm_genai_template_mt_bench_single_turn.json index 562dc1782..397f4c20e 100644 --- a/src/unitxt/catalog/metrics/llm_as_judge/rating/llama_3_70b_instruct_ibm_genai_template_mt_bench_single_turn.json +++ b/src/unitxt/catalog/metrics/llm_as_judge/rating/llama_3_70b_instruct_ibm_genai_template_mt_bench_single_turn.json @@ -3,7 +3,8 @@ "inference_model": { "__type__": "ibm_gen_ai_inference_engine", "model_name": "meta-llama/llama-3-70b-instruct", - "max_new_tokens": 252 + "max_new_tokens": 252, + "random_seed": 42 }, "template": "templates.response_assessment.rating.mt_bench_single_turn", "task": "rating.single_turn", diff --git a/src/unitxt/catalog/metrics/llm_as_judge/rating/llama_3_8b_instruct_ibm_genai_template_mt_bench_single_turn.json b/src/unitxt/catalog/metrics/llm_as_judge/rating/llama_3_8b_instruct_ibm_genai_template_mt_bench_single_turn.json index 86ad3258f..7e6d7a5ea 100644 --- a/src/unitxt/catalog/metrics/llm_as_judge/rating/llama_3_8b_instruct_ibm_genai_template_mt_bench_single_turn.json +++ b/src/unitxt/catalog/metrics/llm_as_judge/rating/llama_3_8b_instruct_ibm_genai_template_mt_bench_single_turn.json @@ -3,7 +3,8 @@ "inference_model": { "__type__": "ibm_gen_ai_inference_engine", "model_name": "meta-llama/llama-3-8b-instruct", - "max_new_tokens": 252 + "max_new_tokens": 252, + "random_seed": 42 }, "template": "templates.response_assessment.rating.mt_bench_single_turn", "task": "rating.single_turn", diff --git a/src/unitxt/catalog/metrics/llm_as_judge/safety/llama_3_70b_instruct_ibm_genai_template_unsafe_content.json b/src/unitxt/catalog/metrics/llm_as_judge/safety/llama_3_70b_instruct_ibm_genai_template_unsafe_content.json index bf0e0c4cd..ba087faf1 100644 --- a/src/unitxt/catalog/metrics/llm_as_judge/safety/llama_3_70b_instruct_ibm_genai_template_unsafe_content.json +++ b/src/unitxt/catalog/metrics/llm_as_judge/safety/llama_3_70b_instruct_ibm_genai_template_unsafe_content.json @@ -3,7 +3,8 @@ "inference_model": { "__type__": "ibm_gen_ai_inference_engine", "model_name": "meta-llama/llama-3-70b-instruct", - "max_new_tokens": 252 + "max_new_tokens": 252, + "random_seed": 42 }, "template": "templates.safety.unsafe_content", "task": "rating.single_turn", diff --git a/src/unitxt/catalog/metrics/llm_as_judge/safety/llama_3_8b_instruct_ibm_genai_template_unsafe_content.json b/src/unitxt/catalog/metrics/llm_as_judge/safety/llama_3_8b_instruct_ibm_genai_template_unsafe_content.json index 33231da97..a40caf7b8 100644 --- a/src/unitxt/catalog/metrics/llm_as_judge/safety/llama_3_8b_instruct_ibm_genai_template_unsafe_content.json +++ b/src/unitxt/catalog/metrics/llm_as_judge/safety/llama_3_8b_instruct_ibm_genai_template_unsafe_content.json @@ -3,7 +3,8 @@ "inference_model": { "__type__": "ibm_gen_ai_inference_engine", "model_name": "meta-llama/llama-3-8b-instruct", - "max_new_tokens": 252 + "max_new_tokens": 252, + "random_seed": 42 }, "template": "templates.safety.unsafe_content", "task": "rating.single_turn",