diff --git a/.github/workflows/catalog_consistency.yml b/.github/workflows/catalog_consistency.yml index a10b44c62..951781af4 100644 --- a/.github/workflows/catalog_consistency.yml +++ b/.github/workflows/catalog_consistency.yml @@ -25,7 +25,7 @@ jobs: - uses: actions/setup-python@v5 with: python-version: '3.9' - + - run: curl -LsSf https://astral.sh/uv/install.sh | sh - run: uv pip install --system -e ".[tests]" diff --git a/.github/workflows/catalog_preparation.yml b/.github/workflows/catalog_preparation.yml index 14cb014fd..fb4f0066d 100644 --- a/.github/workflows/catalog_preparation.yml +++ b/.github/workflows/catalog_preparation.yml @@ -26,7 +26,7 @@ jobs: - uses: actions/setup-python@v5 with: python-version: '3.9' - + - run: curl -LsSf https://astral.sh/uv/install.sh | sh - run: uv pip install --system ".[tests]" diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index c5e92e2cc..07ba6d13a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -5,10 +5,16 @@ repos: # Ruff version. rev: v0.1.6 hooks: - # Run the linter. + # Run the linter on all files except the specific one - id: ruff - args: [ --fix ] - # Run the formatter. + args: [--fix] + exclude: src/unitxt/metrics.py + # Run the linter on the specific file with the ignore flag + - id: ruff + name: ruff (src/unitxt/metrics.py) + files: src/unitxt/metrics.py + args: [--fix, --ignore, C901] + # Run the formatter - id: ruff-format - repo: https://github.com/ibm/detect-secrets diff --git a/.secrets.baseline b/.secrets.baseline index 32b037230..32eb690d7 100644 --- a/.secrets.baseline +++ b/.secrets.baseline @@ -3,7 +3,7 @@ "files": "^.secrets.baseline$", "lines": null }, - "generated_at": "2024-07-09T07:07:12Z", + "generated_at": "2024-07-29T09:03:34Z", "plugins_used": [ { "name": "AWSKeyDetector" @@ -82,7 +82,7 @@ "hashed_secret": "fa172616e9af3d2a24b5597f264eab963fe76889", "is_secret": false, "is_verified": false, - "line_number": 1531, + "line_number": 1841, "type": "Hex High Entropy String", "verified_result": null } diff --git a/docs/docs/adding_dataset.rst b/docs/docs/adding_dataset.rst index d82b255d1..ac648e49b 100644 --- a/docs/docs/adding_dataset.rst +++ b/docs/docs/adding_dataset.rst @@ -2,13 +2,13 @@ .. note:: - To use this tutorial, you need to :ref:`install unitxt `. + To use this tutorial, you need to :ref:`install Unitxt `. ================= Datasets ✨ ================= -This guide will assist you in adding or using your new dataset in unitxt. +This guide will assist you in adding or using your new dataset in Unitxt. The information needed for loading your data will be defined in :class:`TaskCard ` class: @@ -46,9 +46,9 @@ We will use the `bleu` metric for a reference based evaluation. .. code-block:: python task=Task( - input_fields= { "text" : "str", "source_language" : "str", "target_language" : "str"}, - reference_fields= {"translation" : "str"}, - prediction_type="str", + input_fields= { "text" : str, "source_language" : str, "target_language" : str}, + reference_fields= {"translation" : str}, + prediction_type=str, metrics=["metrics.bleu"], ), diff --git a/docs/docs/adding_metric.rst b/docs/docs/adding_metric.rst index 5ee74514c..29022c27c 100644 --- a/docs/docs/adding_metric.rst +++ b/docs/docs/adding_metric.rst @@ -18,17 +18,18 @@ You specify the metrics metrics in the Task. For example: .. 
code-block:: python - task = Task( - input_fields={ "question" : "str" }, - reference_fields={ "answer" : str }, - prediction_type="str", - metrics=[ - "metrics.rouge", - "metrics.normalized_sacrebleu", - "metrics.bert_score.deberta_xlarge_mnli", - "metrics.bert_score.deberta_large_mnli" - ], - ) + + task = Task( + input_fields={"question" : str}, + reference_fields={"answer" : str}, + prediction_type=str, + metrics=[ + "metrics.rouge", + "metrics.normalized_sacrebleu", + "metrics.bert_score.deberta_xlarge_mnli", + "metrics.bert_score.deberta_large_mnli" + ], + ) You can see the full list of built in metrics :ref:`Metrics section `. In this section we will understand Unitxt metrics and learn how to add new metrics. @@ -49,8 +50,8 @@ string class names as predictions. The post processor may convert the string o (e.g. by splitting using a separator). 2. **References** (`references` - optional): This is a list of gold references, from the same type of the prediction. -For example, if the prediction is a string, the references field are a list of strings. If the prediction is -a list of strings (e.g in multi-label classification), then the references field is a *list* of lists of strings. +For example, if the prediction is a string, the references field are a list of strings. If the prediction is +a list of strings (e.g in multi-label classification), then the references field is a *list* of lists of strings. The metric should return a perfect score, if the prediction is equal to one of the references. 3. **Task data** (`task_data` - optional) - all the input and output fields of a task as a dictionary. @@ -72,8 +73,8 @@ Metric Outputs By default, each metric provides scores for each instance separately and global aggregated scores over all instances together. The output of the metrics is a nested dictionary per instance. -The scores calculated on instance `i` by itself are found in `results[i]["score"]["instance"]`. -The global scores calculated over all instances are found in `results[i]["score"]["global"]`. +The scores calculated on instance `i` by itself are found in `results[i]["score"]["instance"]`. +The global scores calculated over all instances are found in `results[i]["score"]["global"]`. Note the global scores are the same in all instances, so usually `results[0]["score"]["global"]` is used to get the global scores. A metric could return multiple scores, but it should always return a field called `score` with the main score of the metric, @@ -92,8 +93,8 @@ For example, the score list for an instance could be: The global scores are calculated over all instances. Metrics can also calculate confidence intervals for the global scores. -This gives you an assessment of the inherient noise in the scores. When you compare runs on same data, check if their confidence -intervals overlap. If so, the difference may not be statistically significant. +This gives you an assessment of the inherient noise in the scores. When you compare runs on same data, check if their confidence +intervals overlap. If so, the difference may not be statistically significant. .. code-block:: python @@ -111,7 +112,7 @@ Metric Outputs with Multiple Metrics ------------------------------------- When multiple metrics are specified, their scores are appended to the score list. -If multiple metrics have the same score names, the score of the metric that appears first in the metrics list has precedence. 
+If multiple metrics have the same score names, the score of the metric that appears first in the metrics list has precedence. If you want to avoid the scores being overwritten by other metrics, you can add a prefix to each metric score. @@ -127,7 +128,7 @@ If you want to avoid the scores being overwritten by other metrics, you can add ) Note that the ``score`` and ``score_names`` are always taken from the first metric in the metric list. - + Metric Base Classes ------------------- @@ -139,7 +140,7 @@ scores are calculated. ``InstanceMetric` - Class for metrics in which the global scores are be calculated by aggregating the instance scores. Typically, the global score is the average of all instance scores. `InstanceMetric` first evaluates each instance separately, -and then aggregate the instances score. Some examples of instance metrics are `Accuracy`, `TokenOverlap`, `CharEditDistance`. +and then aggregate the instances score. Some examples of instance metrics are `Accuracy`, `TokenOverlap`, `CharEditDistance`. ``BulkInstanceMetric`` - Similar to ``InstanceMetric`` , it is for metrics in which the globals score can be calculated by aggregating the instance scores. However, due to implementation efficiently reasons, it's better to run them in bulk (for example, when using LLMs during score calculations). @@ -147,11 +148,11 @@ due to implementation efficiently reasons, it's better to run them in bulk (for Some examples of bulk instance metrics are `SentenceBert`, `Reward`. ``GlobalMetric`` - Class for metrics for which the global scores must be calculated over all the instances together. -Some examples of global metrics are `f1`, `Spearman`, `Kendall Tau`. Note that by default global metrics are executed once per instance -to generate per instance scores, and then once again over all instances together. So if there are 100 instances, -it will first be called 100 times , each on a single instance, and then one time on all 100 instances. +Some examples of global metrics are `f1`, `Spearman`, `Kendall Tau`. Note that by default global metrics are executed once per instance +to generate per instance scores, and then once again over all instances together. So if there are 100 instances, +it will first be called 100 times , each on a single instance, and then one time on all 100 instances. -Instance scores of `GlobalMetrics` are useful for error-analysis. Consider f1 score, for example. +Instance scores of `GlobalMetrics` are useful for error-analysis. Consider f1 score, for example. It can be calculated only on all instances together. Yet it is useful to report the score of every instance so you can see that good instances get f1 score of 1 and bad ones get 0. @@ -163,14 +164,14 @@ so you can see that good instances get f1 score of 1 and bad ones get 0. Adding a New Instance metric ---------------------------- - Assume we want to create a referenceless metric for the task of adding two numbers. - It will take the processed prediction of the task (an integer) and compare to the sum of the + Assume we want to create a referenceless metric for the task of adding two numbers. + It will take the processed prediction of the task (an integer) and compare to the sum of the two task input fields `num1` and `num2`. It will check, for each instance, how close the predicted sum is to the actual sum. - The metric can be configured with a `relative_tolerance` threshold for approximate comparison. 
- If the difference between the prediction and actual result is smaller than the `relative_tolerance` + The metric can be configured with a `relative_tolerance` threshold for approximate comparison. + If the difference between the prediction and actual result is smaller than the `relative_tolerance` threshold, the instance score is 1. Otherwise, the instance result is 0. - The global accuracy result is the mean of the instance scores. + The global accuracy result is the mean of the instance scores. .. code-block:: python @@ -179,7 +180,7 @@ Adding a New Instance metric main_score = "sum_accuracy" # name of the main score reduction_map = {"mean": ["sum_accuracy"]} # defines that the global score is a mean of the instance scores ci_scores = ["sum_accuracy"] # define that confidence internal should be calculated on the score - prediction_type = "int" # the metric expect the prediction as an int + prediction_type = int # the metric expect the prediction as an int # Relation tolerance for errors by default it is 0, but can be changed for approximate comparison relative_tolerance : float = 0 @@ -253,15 +254,15 @@ This is a global metric because it performs the calculation over all the instanc The score is negative (up to -1), if predictions tend to be less accurate when reference values are larger. The score is close to 0, if the magnitude of the reference answer does not correlate with accuracy. - The score is positive (up to 1), if predictions tend to be less accurate when reference values are smaller. + The score is positive (up to 1), if predictions tend to be less accurate when reference values are smaller. In most realistic cases, the score is likely to be zer or negative. """ - prediction_type = "int" + prediction_type = int main_score="sensitivity_to_numeric_magnitude" single_reference_per_prediction = True # validates only one reference is passed per prediction - + def compute( self, references: List[List[int]], predictions: List[int], task_data: List[Dict] ) -> dict: @@ -277,9 +278,9 @@ This is a global metric because it performs the calculation over all the instanc 1. Calculating confidence intervals for global metrics can be costly if each invocation of the metric takes a long time. To avoid calculating confidence internals for global metrics set `n_resamples = 0`. -2. Unitxt calculates instance results in global metrics to allow viewing the output on a single instances. +2. Unitxt calculates instance results in global metrics to allow viewing the output on a single instances. This can help ensure metric behavior is correct, because it can be checked on single instance. -However, sometimes it does not make sense because the global metric assumes a minimum amount of instances. +However, sometimes it does not make sense because the global metric assumes a minimum amount of instances. The per instance calculations can be disabled by setting `process_single_instances = False`. Managing Metric Dependencies @@ -340,11 +341,11 @@ This is done using the predefined HuggingfaceMetric class. 
metric = HuggingfaceMetric( hf_metric_name="bleu", # The name of the metric in huggingface main_score="bleu", # The main score (assumes the metric returns this score name) - prediction_type="str" # The type of the prediction and references (note that by default references are a list of the prediction_type) + prediction_type=str # The type of the prediction and references (note that by default references are a list of the prediction_type) ) add_to_catalog(metric, "metrics.bleu", overwrite=True) -By default, the HuggingfaceMetric wrapper passes the only the `predictions` and `references` fields to +By default, the HuggingfaceMetric wrapper passes the only the `predictions` and `references` fields to the metrics. You can also pass fields from the task_data inputs, by specifying `hf_additional_input_fields`. For example: @@ -352,10 +353,10 @@ For example: metric = HuggingfaceMetric( ... - hf_additional_input_fields_pass = ["num1","num2"], # passes the task's num1 and num2 fields + hf_additional_input_fields_pass = ["num1","num2"], # passes the task's num1 and num2 fields ... - - ) + + ) In the above example, the `num1` and `num2`fields are passed as lists of values to the metric (each element in the list corresponds to an instance). If you want to pass a scalar (single) value to the metric @@ -367,13 +368,13 @@ you can use: ... hf_additional_input_fields_pass_one_value=["tokenize"], ... - ) - + ) + This assumes the field has the same value is in all instances. Note that Huggingface metrics are independent from the tasks they are used for, and receive arbitrary types of predictions, references, and additional parameters. It may be need to map between unitxt field names, values and types to the corresponding interface of the metric, using -the `MetricPipeline` described in the previous section. +the `MetricPipeline` described in the previous section. diff --git a/docs/docs/adding_task.rst b/docs/docs/adding_task.rst index b09a52c83..4e51660e0 100644 --- a/docs/docs/adding_task.rst +++ b/docs/docs/adding_task.rst @@ -25,9 +25,9 @@ The task is formally defined as: from unitxt.blocks import Task task = Task( - input_fields={"num1" : "int", "num2" : "int"}, - reference_fields={"sum" : "int"}, - prediction_type="int", + input_fields={"num1" : int, "num2" : int}, + reference_fields={"sum" : int}, + prediction_type=int, metrics=[ "metrics.sum_accuracy", "metrics.sum_accuracy_approximate" diff --git a/docs/docs/adding_template.rst b/docs/docs/adding_template.rst index b61cd3e49..aa870d7c0 100644 --- a/docs/docs/adding_template.rst +++ b/docs/docs/adding_template.rst @@ -77,30 +77,32 @@ Making Your Custom Template ---------------------------- In order to make your own template, you need to create a class inheriting from `Template` and -implementing its two abstract methods: +implementing its abstract methods: .. code-block:: python - @abstractmethod - def inputs_to_source(self, inputs: Dict[str, object]) -> Tuple[str, str]: + @abstractmethod + def input_fields_to_source(self, input_fields: Dict[str, object]) -> str: + """Create the textual input for the model from the input fields""" pass @abstractmethod - def outputs_to_target_and_references( - self, outputs: Dict[str, object] - ) -> Tuple[str, List[str]]: + def reference_fields_to_target_and_references(self, reference_fields: Dict[str, object]) -> Tuple[str, List[str]]: + """Create a list of references from the reference fields. Also returns one of the references + as the 'target' - the reference used if the instance is used as a demonstration." 
pass -For instance: + + +For instance, this template passes all the input fields to the model as a JSON string. +It also formats the references, taking them from two of the dataset reference fields: 'top_answer' and 'alternative_answer'. .. code-block:: python class MyCustomTemplate(Template): - def inputs_to_source(self, inputs: Dict[str, object]) -> Tuple[str, str]: - return str(inputs) # use all the task inputs fields in their dictionary look - - def outputs_to_target_and_references( - self, outputs: Dict[str, object] - ) -> Tuple[str, List[str]]: - return outputs["label"], [outputs["label"]] + def input_fields_to_source(self, input_fields: Dict[str, object]) -> str: + return json.dumps(input_fields) # provide the JSON string with all fields as the input to the model + def reference_fields_to_target_and_references(self, reference_fields: Dict[str, object]) -> Tuple[str, List[str]]: + # return the target (the reference used in demonstrations) followed by the list of all references + return reference_fields["top_answer"], [reference_fields["top_answer"], reference_fields["alternative_answer"]] diff --git a/docs/docs/examples.rst index 4f3279e6c..573bcb30d 100644 --- a/docs/docs/examples.rst +++ b/docs/docs/examples.rst @@ -57,6 +57,18 @@ Demonstrates how different formats and system prompts effect the input provided Related documentation: :ref:`Formatting tutorial `. +Evaluate the impact of different demonstration example selections ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +Demonstrates how different methods of selecting the demonstrations in in-context learning affect the results. +Three methods are considered: fixed selection of example demonstrations for all test instances, +random selection of example demonstrations for each test instance, +and choosing the demonstration examples most (lexically) similar to each test instance. + +`Example code `_ + +Related documentation: :ref:`Formatting tutorial `. + LLM as Judges -------------- @@ -93,7 +105,7 @@ Evaluate the quality of an LLM as judge Demonstrates how to evaluate an LLM as judge by checking its scores using the gold references of a dataset. It checks if the judge consistently prefers correct outputs over clearly wrong ones. -Note that to check the the ability of the LLM as judge to discern sutble differences between +Note that to check the ability of the LLM as judge to discern subtle differences between partially correct answers requires more refined tests and corresponding labeled data. The example shows an 8b llama based judge is not a good judge for a summarization task, while the 70b model performs much better. @@ -103,3 +115,35 @@ while the 70b model performs much better. Related documentation: :ref:`LLM as a Judge Metrics Guide `. +Evaluate your model on the Arena Hard benchmark using a custom LLMaJ +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +Demonstrates how to evaluate a user model on the Arena Hard benchmark, using an LLMaJ other than GPT-4. + +`Example code `_ + +Related documentation: :ref:`Evaluate a Model on Arena Hard Benchmark `. + +Evaluate a judge model's performance in judging the Arena Hard Benchmark +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +Demonstrates how to evaluate the capability of a user model to act as a judge on the Arena Hard benchmark. +The model is evaluated on its ability to produce judgments that correlate with the GPT-4 judgments on the benchmark.
+ +`Example code `_ + +Related documentation: :ref:`Evaluate a Model on Arena Hard Benchmark `. + +Evaluate using ensemble of LLM as a judge metrics +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +Demonstrates how to create a metric which is an ensemble of LLM as a judge metrics. +The example shows how to ensemble two judges which uses different templates. + +`Example code `_ + +Related documentation: :ref:`LLM as a Judge Metrics Guide `. + + + + diff --git a/docs/docs/introduction.rst b/docs/docs/introduction.rst index 1d85817b3..92f454543 100644 --- a/docs/docs/introduction.rst +++ b/docs/docs/introduction.rst @@ -16,25 +16,25 @@ Unitxt deconstructs the data preparations and evaluation flows into modular comp Key Capabilities: -- Built in support for a variety of NLP tasks, including ones not typically found in other frameworks, such as multi label classification, targeted sentiment analysis, entity and relation extraction, table understanding, and retrieval augmented generation. +- Built-in support for a variety of NLP tasks, including ones not typically found in other frameworks, such as multi label classification, targeted sentiment analysis, entity and relation extraction, table understanding, and retrieval augmented generation -- Support for changing templates and formats. +- Support for changing templates and formats -- Supports loading data from different datasources (e.g Local files, Huggingface, Cloud Storage, Kaggle ) +- Support for loading data from different datasources (e.g., local files, Hugging Face, cloud storage, Kaggle) -- Large collection of metrics (including LLM as Judges) +- Large collection of metrics (including LLMs as Judges) -- Compatible with Huggingface Dataset and Metric APIs and can be used without installation +- Compatible with Hugging Face Dataset and Metric APIs without needing any installation -- The same Unitxt data preparation pipeline can be used in evaluation and during inference in production systems +- The same Unitxt data preparation pipeline can be used for both evaluation and inference in production systems -- Removes the requirement to run user python code in dataset processing - reducing security risks +- Removes the requirement to run user Python code in dataset processing, reducing security risks -Unitxt can be used in standalone code, and is also integrated into common libraries and evaluation frameworks such as -`HuggingFace`_, `Helm`_, `LM-eval-harness`_. +Unitxt can be used as standalone code. It can also be integrated with common libraries and evaluation frameworks such as +`HuggingFace`_, `Helm`_, and `LM-eval-harness`_. -To get started, can explore the Unitxt :ref:`catalog `, and then see how you can load a :ref:`dataset` and :ref:`evaluate ` it in a just a few lines of code. -Finally, you can then learn how to :ref:`add new datasets `. +To get started, you can explore the Unitxt :ref:`catalog `. Learn how you can load a :ref:`dataset` and :ref:`evaluate ` it in a just a few lines of code. +You can then learn how to :ref:`add new datasets `. Beyond being a tool, Unitxt is a community-driven platform, empowering users to build, share, and advance their pipelines collaboratively. 
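As a rough illustration of the "few lines of code" claim in the introduction above, here is a minimal sketch distilled from the example scripts added in this change (it mirrors examples/evaluate_using_metrics_ensemble.py with the ensemble metric removed; the card, template, and model names are taken from that example and can be swapped for others):

.. code-block:: python

    from unitxt.api import evaluate, load_dataset
    from unitxt.inference import HFPipelineBasedInferenceEngine
    from unitxt.text_utils import print_dict

    # Load the SQuAD QA card with a simple template; loader_limit keeps the run small.
    dataset = load_dataset(
        card="cards.squad",
        template="templates.qa.with_context.simple",
        loader_limit=20,
    )
    test_dataset = dataset["test"]

    # Run a small Hugging Face model to obtain predictions.
    inference_model = HFPipelineBasedInferenceEngine(
        model_name="google/flan-t5-base", max_new_tokens=32
    )
    predictions = inference_model.infer(test_dataset)

    # Score the predictions with the metrics defined by the card's task.
    evaluated_dataset = evaluate(predictions=predictions, data=test_dataset)
    print_dict(evaluated_dataset[0]["score"]["global"])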
diff --git a/examples/evaluate_a_judge_model_capabilities_on_arena_hard.py b/examples/evaluate_a_judge_model_capabilities_on_arena_hard.py new file mode 100644 index 000000000..c8d91f9fa --- /dev/null +++ b/examples/evaluate_a_judge_model_capabilities_on_arena_hard.py @@ -0,0 +1,35 @@ +from unitxt import evaluate, load_dataset +from unitxt.inference import MockInferenceEngine +from unitxt.text_utils import print_dict + +model_id = "meta-llama/llama-3-70b-instruct" +model_format = "formats.llama3_instruct" + +""" +We are evaluating only on a small subset (by using "select(range(4))"), in order for the example to finish quickly. +The dataset's full size is around 40k examples. You should use around 1k-4k in your evaluations. +""" +dataset = load_dataset( + card="cards.arena_hard.response_assessment.pairwise_comparative_rating.both_games_gpt_4_judge", + template="templates.response_assessment.pairwise_comparative_rating.arena_hard_with_shuffling", + format=model_format, +)["test"].select(range(4)) + +inference_model = MockInferenceEngine(model_name=model_id) +""" +We are using a mock inference engine (and model) in order for the example to finish quickly. +In real scenarios you can use models from Hugging Face, OpenAI, and IBM, using the following: +from unitxt.inference import (HFPipelineBasedInferenceEngine, IbmGenAiInferenceEngine, OpenAiInferenceEngine) +and use them in place of the MockInferenceEngine class in the example. +For the arguments these inference engines can receive, please refer to the classes documentation. + +Example of using an IBM model: +from unitxt.inference import (IbmGenAiInferenceEngine, IbmGenAiInferenceEngineParamsMixin) +params = IbmGenAiInferenceEngineParamsMixin(max_new_tokens=1024, random_seed=42) +inference_model = IbmGenAiInferenceEngine(model_name=model_id, parameters=params) +""" + +predictions = inference_model.infer(dataset) +scores = evaluate(predictions=predictions, data=dataset) + +print_dict(scores[0]["score"]["global"]) diff --git a/examples/evaluate_a_model_using_arena_hard.py b/examples/evaluate_a_model_using_arena_hard.py new file mode 100644 index 000000000..ce42fc38f --- /dev/null +++ b/examples/evaluate_a_model_using_arena_hard.py @@ -0,0 +1,38 @@ +from unitxt import evaluate, load_dataset +from unitxt.inference import MockInferenceEngine +from unitxt.text_utils import print_dict + +model_id = "meta-llama/llama-3-70b-instruct" +model_format = "formats.llama3_instruct" + +""" +We are evaluating only on a small subset (by using "select(range(4))"), in order for the example to finish quickly. +The dataset's full size is around 40k examples. You should use around 1k-4k in your evaluations. +""" +dataset = load_dataset( + card="cards.arena_hard.generation.english_gpt_4_0314_reference", + template="templates.empty", + format=model_format, + metrics=[ + "metrics.llm_as_judge.pairwise_comparative_rating.llama_3_8b_instruct_ibm_genai_template_arena_hard_with_shuffling" + ], +)["test"].select(range(4)) + +inference_model = MockInferenceEngine(model_name=model_id) +""" +We are using a mock inference engine (and model) in order for the example to finish quickly. +In real scenarios you can use models from Hugging Face, OpenAI, and IBM, using the following: +from unitxt.inference import (HFPipelineBasedInferenceEngine, IbmGenAiInferenceEngine, OpenAiInferenceEngine) +and use them in place of the MockInferenceEngine class in the example. +For the arguments these inference engines can receive, please refer to the classes documentation.
+ +Example of using an IBM model: +from unitxt.inference import (IbmGenAiInferenceEngine, IbmGenAiInferenceEngineParamsMixin) +params = IbmGenAiInferenceEngineParamsMixin(max_new_tokens=1024, random_seed=42) +inference_model = IbmGenAiInferenceEngine(model_name=model_id, parameters=params) +""" + +predictions = inference_model.infer(dataset) +scores = evaluate(predictions=predictions, data=dataset) + +print_dict(scores[0]["score"]["global"]) diff --git a/examples/evaluate_different_demo_selections.py b/examples/evaluate_different_demo_selections.py new file mode 100644 index 000000000..9dbb51ac3 --- /dev/null +++ b/examples/evaluate_different_demo_selections.py @@ -0,0 +1,62 @@ +import pandas as pd +from unitxt import get_logger +from unitxt.api import evaluate, load_dataset +from unitxt.inference import IbmGenAiInferenceEngine +from unitxt.splitters import CloseTextSampler, FixedIndicesSampler, RandomSampler +from unitxt.text_utils import print_dict + +logger = get_logger() + +# This example evaluates different kinds of demo selection strategies on a classification task. +# The different strategies are evaluated in 1, 3, and 5 shot settings. The examples are selected from a demo pool of 300 examples. +# RandomSampler - randomly sample a different set of examples for each test instance +# CloseTextSampler - select the lexically closest samples from the demo pool for each test instance +# FixedIndicesSampler - select the same fixed set of demo examples for all instances + +card = "cards.ledgar" +model_name = "google/flan-t5-xxl" +inference_model = IbmGenAiInferenceEngine(model_name=model_name, max_new_tokens=32) + + +df = pd.DataFrame(columns=["num_demos", "sampler", "f1_micro", "ci_low", "ci_high"]) + +for num_demos in [1, 3, 5]: + for demo_sampler in [ + RandomSampler(), + CloseTextSampler(field="text"), + FixedIndicesSampler(indices=[0, 1, 2, 4, 5]), + ]: + dataset = load_dataset( + card=card, + template="templates.classification.multi_class.title", + num_demos=num_demos, + demos_pool_size=300, + loader_limit=400, + max_test_instances=200, + sampler=demo_sampler, + ) + + test_dataset = dataset["test"] + + predictions = inference_model.infer(test_dataset) + evaluated_dataset = evaluate(predictions=predictions, data=test_dataset) + + logger.info( + f"Sample input and output for sampler {demo_sampler} and num_demos '{num_demos}':" + ) + print_dict( + evaluated_dataset[0], + keys_to_print=["source", "prediction", "processed_prediction"], + ) + global_scores = evaluated_dataset[0]["score"]["global"] + + df.loc[len(df)] = [ + num_demos, + demo_sampler.to_json(), + global_scores["score"], + global_scores["score_ci_low"], + global_scores["score_ci_high"], + ] + + df = df.round(decimals=2) + logger.info(df.to_markdown()) diff --git a/examples/evaluate_using_metrics_ensemble.py b/examples/evaluate_using_metrics_ensemble.py new file mode 100644 index 000000000..ee99ec8de --- /dev/null +++ b/examples/evaluate_using_metrics_ensemble.py @@ -0,0 +1,50 @@ +from unitxt import get_logger +from unitxt.api import evaluate, load_dataset +from unitxt.inference import ( + HFPipelineBasedInferenceEngine, +) +from unitxt.metrics import MetricsEnsemble +from unitxt.text_utils import print_dict + +logger = get_logger() + +# define the metrics ensemble +ensemble_metric = MetricsEnsemble( + metrics=[ + "metrics.llm_as_judge.rating.llama_3_70b_instruct_ibm_genai_template_generic_single_turn", + "metrics.llm_as_judge.rating.llama_3_70b_instruct_ibm_genai_template_mt_bench_single_turn", + ], + weights=[0.75, 0.25], +) +# Use the HF 
load_dataset API, to load the squad QA dataset using the standard template in the catalog. +# We set loader_limit to 20 to reduce download time. +dataset = load_dataset( + card="cards.squad", + template="templates.qa.with_context.simple", + metrics=[ensemble_metric], + loader_limit=20, +) +test_dataset = dataset["test"] + +# Infer a model to get predictions. +model_name = "google/flan-t5-base" +inference_model = HFPipelineBasedInferenceEngine( + model_name=model_name, max_new_tokens=32 +) +predictions = inference_model.infer(test_dataset) + +# Evaluate the predictions using the defined metric. +evaluated_dataset = evaluate(predictions=predictions, data=test_dataset) + +# Print results +for instance in evaluated_dataset: + print_dict( + instance, + keys_to_print=[ + "source", + "prediction", + "processed_prediction", + "references", + "score", + ], + ) diff --git a/examples/standalone_evaluation_llm_as_judge.py b/examples/standalone_evaluation_llm_as_judge.py index 20ae7ad31..1561d4d29 100644 --- a/examples/standalone_evaluation_llm_as_judge.py +++ b/examples/standalone_evaluation_llm_as_judge.py @@ -14,18 +14,42 @@ # First, we define the examples data. data = { "test": [ - {"question": "What is the capital of Texas?", "answer": ""}, - {"question": "What is the color of the sky?", "answer": ""}, + { + "query": "What is the capital of Texas?", + "document": "The capital of Texas is Austin.", + "reference_answer": "Austin", + }, + { + "query": "What is the color of the sky?", + "document": "The sky is generally black during the night.", + "reference_answer": "Black", + }, ] } # Second, We define the prompt we show to the judge. +# +# Note that "question" is the full input provided to the original model, and "answer" is the original model +# output. For example , this is sample input provided to the LLM as judge model. +# +# Please act as an impartial judge and evaluate if the assistant's answer is correct. Answer "[[10]]" if the answer is accurate, and "[[0]]" if the answer is wrong. Please use the exact format of the verdict as "[[rate]]". +# You can explain your answer after the verdict. +# [User's input] +# Answer the following query based on the provided document. +# Document: +# The sky is generally black during the night. +# Query: +# What is the color of the sky? +# +# [Assistant's Answer] +# black + judge_correctness_template = InputOutputTemplate( instruction="Please act as an impartial judge and evaluate if the assistant's answer is correct." ' Answer "[[10]]" if the answer is accurate, and "[[0]]" if the answer is wrong. ' 'Please use the exact format of the verdict as "[[rate]]". 
' "You can explain your answer after the verdict" ".\n\n", - input_format="[Question]\n{question}\n\n" "[Assistant's Answer]\n{answer}\n", + input_format="[User's input]\n{question}\n" "[Assistant's Answer]\n{answer}\n", output_format="[[{rating}]]", postprocessors=[ r"processors.extract_mt_bench_rating_judgment", @@ -56,17 +80,17 @@ card = TaskCard( loader=LoadFromDictionary(data=data), task=Task( - input_fields={"question": "str"}, - reference_fields={"answer": "str"}, - prediction_type="str", + input_fields={"query": str, "document": str}, + reference_fields={"reference_answer": str}, + prediction_type=str, metrics=[llm_judge_metric], ), templates=TemplatesDict( { "simple": InputOutputTemplate( - instruction="Answer the following question.", - input_format="{question}", - output_format="{answer}", + instruction="Answer the following query based on the provided document.", + input_format="Document:\n{document}\nQuery:\n{query}", + output_format="{reference_answer}", postprocessors=["processors.lower_case"], ) } diff --git a/examples/standalone_qa_evaluation.py b/examples/standalone_qa_evaluation.py index 44e2c50d4..0db61fd0a 100644 --- a/examples/standalone_qa_evaluation.py +++ b/examples/standalone_qa_evaluation.py @@ -24,9 +24,9 @@ loader=LoadFromDictionary(data=data), # Define the QA task input and output and metrics. task=Task( - input_fields={"question": "str"}, - reference_fields={"answer": "str"}, - prediction_type="str", + input_fields={"question": str}, + reference_fields={"answer": str}, + prediction_type=str, metrics=["metrics.accuracy"], ), # Create a simple template that formats the input. diff --git a/prepare/cards/arena_hard/common.py b/prepare/cards/arena_hard/common.py new file mode 100644 index 000000000..b391c706b --- /dev/null +++ b/prepare/cards/arena_hard/common.py @@ -0,0 +1,150 @@ +from unitxt import add_to_catalog +from unitxt.operator import SequentialOperator +from unitxt.operators import ( + Apply, + Copy, + FilterByCondition, + RenameFields, + SelectFields, + Set, +) +from unitxt.splitters import RenameSplits +from unitxt.stream_operators import DeleteSplits, JoinStreams + +arena_hard_scores = ["A=B", "A>B", "A>>B", "B>A", "B>>A"] + +arena_hard_hf_space_processing_steps = SequentialOperator( + steps=[ + # region Question file + RenameFields( + field_to_field={"cluster": "group"}, apply_to_streams=["questions"] + ), + Copy( + field_to_field={"turns/0/content": "model_input"}, + apply_to_streams=["questions"], + ), + # endregion + # region Answers file processing + Copy( + field_to_field={ + "choices/0/turns/0/content": "model_output", + "choices/0/turns/0/token_len": "model_output_token_len", + }, + apply_to_streams=["model_answer"], + ), + Apply( + "model_id", + function="str.lower", + to_field="model_id", + apply_to_streams=["model_answer"], + ), + # endregion + # region Judgment file + Copy( + field_to_field={ + "games/0/user_prompt": "judge_input_model_1_ordered_first", + "games/1/user_prompt": "judge_input_model_2_ordered_first", + "games/0/judgment": "judge_output_model_1_ordered_first", + "games/1/judgment": "judge_output_model_2_ordered_first", + "games/0/score": "score_model_1_ordered_first", + "games/1/score": "score_model_2_ordered_first", + }, + apply_to_streams=["judgment"], + ), + RenameFields( + field_to_field={"model": "model_2", "judge": "judge_model_id"}, + apply_to_streams=["judgment"], + ), + Set(fields={"model_1": "gpt-4-0314"}, apply_to_streams=["judgment"]), + Apply( + "judge_input_model_1_ordered_first", + function="str", + 
to_field="judge_input_model_1_ordered_first", + apply_to_streams=["judgment"], + ), + Apply( + "judge_input_model_2_ordered_first", + function="str", + to_field="judge_input_model_2_ordered_first", + apply_to_streams=["judgment"], + ), + Apply( + "model_1", + function="str.lower", + to_field="model_1", + apply_to_streams=["judgment"], + ), + Apply( + "model_2", + function="str.lower", + to_field="model_2", + apply_to_streams=["judgment"], + ), + FilterByCondition( + values={ + "score_model_1_ordered_first": arena_hard_scores, + "score_model_2_ordered_first": arena_hard_scores, + }, + condition="in", + apply_to_streams=["judgment"], + ), + # endregion + # region Join + JoinStreams( + left_stream="questions", + right_stream="judgment", + how="inner", + on=["question_id"], + new_stream_name="merged_stream", + ), + RenameFields( + field_to_field={"model_id": "model_1", "model_output": "model_1_output"}, + apply_to_streams=["model_answer"], + ), + JoinStreams( + left_stream="merged_stream", + right_stream="model_answer", + how="inner", + on=["question_id", "model_1"], + new_stream_name="merged_stream", + ), + RenameFields( + field_to_field={"model_1": "model_2", "model_1_output": "model_2_output"}, + apply_to_streams=["model_answer"], + ), + JoinStreams( + left_stream="merged_stream", + right_stream="model_answer", + how="inner", + on=["question_id", "model_2"], + new_stream_name="merged_stream", + ), + # endregion + DeleteSplits(splits=["questions", "model_answer", "judgment"]), + RenameSplits({"merged_stream": "test"}), + SelectFields( + fields=[ + "question_id", + "category", + "model_input", + "model_1", + "model_2", + "judge_model_id", + "model_1_output", + "model_2_output", + "score_model_1_ordered_first", + "score_model_2_ordered_first", + "judge_input_model_1_ordered_first", + "judge_input_model_2_ordered_first", + "judge_output_model_1_ordered_first", + "judge_output_model_2_ordered_first", + ] + ), + ] +) + +add_to_catalog( + arena_hard_hf_space_processing_steps, + "operators.arena_hard_hf_space_processing_steps", + overwrite=True, +) diff --git a/prepare/cards/arena_hard/generation/english_gpt-4-0314_reference.py b/prepare/cards/arena_hard/generation/english_gpt-4-0314_reference.py new file mode 100644 index 000000000..4293028d9 --- /dev/null +++ b/prepare/cards/arena_hard/generation/english_gpt-4-0314_reference.py @@ -0,0 +1,96 @@ +from unitxt.blocks import ( + TaskCard, +) +from unitxt.catalog import add_to_catalog +from unitxt.loaders import LoadFromHFSpace +from unitxt.operators import ( + Apply, + Copy, + RenameFields, + SelectFields, + Set, +) +from unitxt.stream_operators import DeleteSplits, JoinStreams +from unitxt.test_utils.card import test_card + +card = TaskCard( + loader=LoadFromHFSpace( + space_name="lmsys/arena-hard-browser", + revision="03b91ca", # May 26, 2024 + data_files={ + "questions": "data/arena-hard-v0.1/question.jsonl", + "model_answer": "data/arena-hard-v0.1/model_answer/gpt-4-0314.jsonl", + }, + ), + preprocess_steps=[ + # region Question file + RenameFields( + field_to_field={"cluster": "group"}, apply_to_streams=["questions"] + ), + Copy( + field_to_field={"turns/0/content": "model_input"}, + apply_to_streams=["questions"], + ), + Set(fields={"reference_model": "gpt-4-0314"}, apply_to_streams=["questions"]), + # endregion + # region Answers file processing + Copy( + field_to_field={ + "choices/0/turns/0/content": "reference_model_output", + "choices/0/turns/0/token_len": "reference_model_output_token_len", + }, + apply_to_streams=["model_answer"], + ), + 
RenameFields( + field_to_field={"model_id": "reference_model"}, + apply_to_streams=["model_answer"], + ), + Apply( + "reference_model", + function="str.lower", + to_field="reference_model", + apply_to_streams=["model_answer"], + ), + # endregion + # region Join + JoinStreams( + left_stream="questions", + right_stream="model_answer", + how="inner", + on=["question_id", "reference_model"], + new_stream_name="test", + ), + DeleteSplits(splits=["questions", "model_answer"]), + SelectFields( + fields=[ + "question_id", + "category", + "model_input", + "reference_model", + "reference_model_output", + ] + ), + RenameFields( + field_to_field={ + "model_input": "input", + "category": "group", + "reference_model_output": "output", + } + ), + Set( + fields={ + "type_of_input": "prompt", + "type_of_output": "answer", + } + ), + ], + task="tasks.generation", + templates=["templates.empty"], +) + +test_card(card, demos_taken_from="test", strict=False, loader_limit=100) +add_to_catalog( + card, + "cards.arena_hard.generation.english_gpt_4_0314_reference", + overwrite=True, +) diff --git a/prepare/cards/arena_hard/response_assessment/pairwise_comparative_rating/both_games_gpt4_judge.py b/prepare/cards/arena_hard/response_assessment/pairwise_comparative_rating/both_games_gpt4_judge.py new file mode 100644 index 000000000..a19350100 --- /dev/null +++ b/prepare/cards/arena_hard/response_assessment/pairwise_comparative_rating/both_games_gpt4_judge.py @@ -0,0 +1,77 @@ +from unitxt.blocks import ( + TaskCard, +) +from unitxt.catalog import add_to_catalog +from unitxt.loaders import LoadFromHFSpace +from unitxt.operators import ( + MapInstanceValues, + MergeStreams, + RenameFields, +) +from unitxt.stream_operators import DeleteSplits, DuplicateSplit +from unitxt.test_utils.card import test_card + +score_mapper = {"A=B": 0, "A>B": 1, "A>>B": 3, "B>A": -1, "B>>A": -3} + +card = TaskCard( + loader=LoadFromHFSpace( + space_name="lmsys/arena-hard-browser", + revision="03b91ca", # May 26, 2024 + data_files={ + "questions": "data/arena-hard-v0.1/question.jsonl", + "model_answer": "data/arena-hard-v0.1/model_answer/*.jsonl", + "judgment": "data/arena-hard-v0.1/model_judgment/gpt-4-1106-preview/*.jsonl", + }, + ), + preprocess_steps=[ + "operators.arena_hard_hf_space_processing_steps", + DuplicateSplit(split="test", to_split="game_2"), + RenameFields( + field_to_field={ + "model_input": "question", + "model_1_output": "answer_a", + "model_2_output": "answer_b", + "score_model_1_ordered_first": "answer_a_preference", + "category": "group", + "model_1": "model_a", + "model_2": "model_b", + }, + apply_to_streams=["test"], + ), + RenameFields( + field_to_field={ + "model_input": "question", + "model_1_output": "answer_b", + "model_2_output": "answer_a", + "score_model_2_ordered_first": "answer_a_preference", + "category": "group", + "model_1": "model_b", + "model_2": "model_a", + }, + apply_to_streams=["game_2"], + ), + MergeStreams( + streams_to_merge=["test", "game_2"], + new_stream_name="test", + add_origin_stream_name=False, + ), + DeleteSplits(splits=["game_2"]), + MapInstanceValues( + { + "answer_a_preference": score_mapper, + } + ), + ], + task="tasks.response_assessment.pairwise_comparative_rating.single_turn", + templates=[ + "templates.response_assessment.pairwise_comparative_rating.arena_hard", + "templates.response_assessment.pairwise_comparative_rating.arena_hard_with_shuffling", + ], +) + +test_card(card, demos_taken_from="test", strict=False, loader_limit=100000) +add_to_catalog( + card, + 
"cards.arena_hard.response_assessment.pairwise_comparative_rating.both_games_gpt_4_judge", + overwrite=True, +) diff --git a/prepare/cards/arena_hard/response_assessment/pairwise_comparative_rating/both_games_mean_judgment_gpt4_judge.py b/prepare/cards/arena_hard/response_assessment/pairwise_comparative_rating/both_games_mean_judgment_gpt4_judge.py new file mode 100644 index 000000000..a53ba7dd6 --- /dev/null +++ b/prepare/cards/arena_hard/response_assessment/pairwise_comparative_rating/both_games_mean_judgment_gpt4_judge.py @@ -0,0 +1,62 @@ +from unitxt.blocks import ( + TaskCard, +) +from unitxt.catalog import add_to_catalog +from unitxt.loaders import LoadFromHFSpace +from unitxt.operators import ( + ExecuteExpression, + MapInstanceValues, + RenameFields, +) +from unitxt.test_utils.card import test_card + +score_mapper = {"A=B": 0, "A>B": 1, "A>>B": 3, "B>A": -1, "B>>A": -3} + +score_mapper_reversed = {k: -1 * v for k, v in score_mapper.items()} + +card = TaskCard( + loader=LoadFromHFSpace( + space_name="lmsys/arena-hard-browser", + revision="03b91ca", # May 26, 2024 + data_files={ + "questions": "data/arena-hard-v0.1/question.jsonl", + "model_answer": "data/arena-hard-v0.1/model_answer/*.jsonl", + "judgment": "data/arena-hard-v0.1/model_judgment/gpt-4-1106-preview/*.jsonl", + }, + ), + preprocess_steps=[ + "operators.arena_hard_hf_space_processing_steps", + MapInstanceValues( + { + "score_model_1_ordered_first": score_mapper, + "score_model_2_ordered_first": score_mapper_reversed, + } + ), + ExecuteExpression( + to_field="answer_a_preference", + expression="int(round((score_model_1_ordered_first+score_model_2_ordered_first)/2))", + ), + RenameFields( + field_to_field={ + "model_input": "question", + "model_1_output": "answer_a", + "model_2_output": "answer_b", + "category": "group", + "model_1": "model_a", + "model_2": "model_b", + } + ), + ], + task="tasks.response_assessment.pairwise_comparative_rating.single_turn", + templates=[ + "templates.response_assessment.pairwise_comparative_rating.arena_hard", + "templates.response_assessment.pairwise_comparative_rating.arena_hard_with_shuffling", + ], +) + +test_card(card, demos_taken_from="test", strict=False, loader_limit=100000) +add_to_catalog( + card, + "cards.arena_hard.response_assessment.pairwise_comparative_rating.both_games_mean_judgment_gpt4_judge", + overwrite=True, +) diff --git a/prepare/cards/arena_hard/response_assessment/pairwise_comparative_rating/first_game_only_gpt4_judge.py b/prepare/cards/arena_hard/response_assessment/pairwise_comparative_rating/first_game_only_gpt4_judge.py new file mode 100644 index 000000000..5b4ace971 --- /dev/null +++ b/prepare/cards/arena_hard/response_assessment/pairwise_comparative_rating/first_game_only_gpt4_judge.py @@ -0,0 +1,55 @@ +from unitxt.blocks import ( + TaskCard, +) +from unitxt.catalog import add_to_catalog +from unitxt.loaders import LoadFromHFSpace +from unitxt.operators import ( + MapInstanceValues, + RenameFields, +) +from unitxt.test_utils.card import test_card + +score_mapper = {"A=B": 0, "A>B": 1, "A>>B": 3, "B>A": -1, "B>>A": -3} + +card = TaskCard( + loader=LoadFromHFSpace( + space_name="lmsys/arena-hard-browser", + revision="03b91ca", # May 26, 2024 + data_files={ + "questions": "data/arena-hard-v0.1/question.jsonl", + "model_answer": "data/arena-hard-v0.1/model_answer/*.jsonl", + "judgment": "data/arena-hard-v0.1/model_judgment/gpt-4-1106-preview/*.jsonl", + }, + ), + preprocess_steps=[ + "operators.arena_hard_hf_space_processing_steps", + RenameFields( + 
field_to_field={ + "model_input": "question", + "model_1_output": "answer_a", + "model_2_output": "answer_b", + "score_model_1_ordered_first": "answer_a_preference", + "category": "group", + "model_1": "model_a", + "model_2": "model_b", + }, + ), + MapInstanceValues( + { + "answer_a_preference": score_mapper, + } + ), + ], + task="tasks.response_assessment.pairwise_comparative_rating.single_turn", + templates=[ + "templates.response_assessment.pairwise_comparative_rating.arena_hard", + "templates.response_assessment.pairwise_comparative_rating.arena_hard_with_shuffling", + ], +) + +test_card(card, demos_taken_from="test", strict=False, loader_limit=100000) +add_to_catalog( + card, + "cards.arena_hard.response_assessment.pairwise_comparative_rating.first_game_only_gpt_4_judge", + overwrite=True, +) diff --git a/prepare/cards/billsum.py b/prepare/cards/billsum.py index 9caedf77b..b16c53bbb 100644 --- a/prepare/cards/billsum.py +++ b/prepare/cards/billsum.py @@ -1,9 +1,7 @@ -import sys - from unitxt import add_to_catalog from unitxt.blocks import Set, SplitRandomMix, TaskCard from unitxt.loaders import LoadHF -from unitxt.operators import FilterByExpression, RenameFields, Shuffle +from unitxt.operators import FilterByExpression, RenameFields from unitxt.test_utils.card import test_card # https://huggingface.co/datasets/billsum @@ -16,7 +14,6 @@ SplitRandomMix( {"train": "train[87.5%]", "validation": "train[12.5%]", "test": "test"} ), - Shuffle(page_size=sys.maxsize), RenameFields(field_to_field={"text": "document"}), Set(fields={"document_type": "document"}), ] diff --git a/prepare/cards/rag/response_generation/clapnq.py b/prepare/cards/clapnq.py similarity index 65% rename from prepare/cards/rag/response_generation/clapnq.py rename to prepare/cards/clapnq.py index e76f6f272..d9575efea 100644 --- a/prepare/cards/rag/response_generation/clapnq.py +++ b/prepare/cards/clapnq.py @@ -9,10 +9,24 @@ ) from unitxt.operators import ( Copy, + MapInstanceValues, Set, ) from unitxt.test_utils.card import test_card +unanswerable_responses = [ + "I'm sorry, I cannot answer this question based on the context.", + "The answer is not in the text provided.", + "Unanswerable.", + "The provided context does not contain the information needed to answer this question.", + "There is not enough information in the text to answer this question.", + "The text does not provide an answer to this question.", + "Based on the context, an answer cannot be determined.", + "The answer to this question is not available in the provided context.", + "This question cannot be answered with the given information.", + "Insufficient context to provide an answer.", +] + card = TaskCard( loader=LoadHF( path="PrimeQA/clapnq", @@ -31,6 +45,10 @@ "contexts_ids": [], } ), + MapInstanceValues( + mappers={"reference_answers": {"['']": unanswerable_responses}}, + strict=False, + ), ], task="tasks.rag.response_generation", templates=TemplatesDict( diff --git a/prepare/cards/coedit.py b/prepare/cards/coedit.py index aba922397..432cc6d0e 100644 --- a/prepare/cards/coedit.py +++ b/prepare/cards/coedit.py @@ -120,7 +120,8 @@ "input_type": "sentence", } ), - RenameFields(field_to_field={"src": "input", "tgt": "output_choice"}), + RenameFields(field="src", to_field="input"), + IndexOf(search_in="choices", index_of="tgt", to_field="output_choice"), ], task="tasks.evaluation.preference", templates="templates.evaluation.preference.all", diff --git a/prepare/cards/cohere_for_ai.py b/prepare/cards/cohere_for_ai.py index 9b5d6e6a3..ea6325147 100644 --- 
a/prepare/cards/cohere_for_ai.py +++ b/prepare/cards/cohere_for_ai.py @@ -173,5 +173,4 @@ card, f"cards.cohere_for_ai.{subset}.{lang}", overwrite=True, - catalog_path="src/unitxt/catalog", ) diff --git a/prepare/cards/dynamic_cards_for_llm_judges/llm_as_judge_metrics.py b/prepare/cards/dynamic_cards_for_llm_judges/llm_as_judge_metrics.py index ace96f2d7..8420f3edf 100644 --- a/prepare/cards/dynamic_cards_for_llm_judges/llm_as_judge_metrics.py +++ b/prepare/cards/dynamic_cards_for_llm_judges/llm_as_judge_metrics.py @@ -4,6 +4,7 @@ tasks = [ "tasks.response_assessment.rating.single_turn", "tasks.response_assessment.rating.single_turn_with_reference", + "tasks.response_assessment.pairwise_comparative_rating.single_turn", ] for task in tasks: card = TaskCard(loader=None, preprocess_steps=[], task=task) diff --git a/prepare/cards/fin_qa.py b/prepare/cards/fin_qa.py index c502bdc66..bb6341855 100644 --- a/prepare/cards/fin_qa.py +++ b/prepare/cards/fin_qa.py @@ -5,7 +5,7 @@ TemplatesList, ) from unitxt.catalog import add_to_catalog -from unitxt.operators import CopyFields +from unitxt.operators import CopyFields, FilterByExpression from unitxt.struct_data_operators import MapTableListsToStdTableJSON from unitxt.task import Task from unitxt.templates import InputOutputTemplate @@ -14,6 +14,7 @@ card = TaskCard( loader=LoadHF(path="ibm/finqa", streaming=False), preprocess_steps=[ + FilterByExpression(expression="len(table) > 1"), CopyFields(field_to_field=[["pre_text/0", "pre_text"]]), CopyFields(field_to_field=[["post_text/0", "post_text"]]), MapTableListsToStdTableJSON(field_to_field=[["table", "stdtable"]]), @@ -23,13 +24,13 @@ ], task=Task( inputs={ - "pre_text": "str", - "serialized_table": "str", - "post_text": "str", - "question": "str", + "pre_text": str, + "serialized_table": str, + "post_text": str, + "question": str, }, - outputs={"program_re": "str", "answer": "str"}, - prediction_type="str", + outputs={"program_re": str, "answer": str}, + prediction_type=str, metrics=["metrics.fin_qa_metric"], augmentable_inputs=["pre_text", "serialized_table", "post_text", "question"], ), diff --git a/prepare/cards/hh_rlhf.py b/prepare/cards/hh_rlhf.py index 38216db17..67677d1d8 100644 --- a/prepare/cards/hh_rlhf.py +++ b/prepare/cards/hh_rlhf.py @@ -4,7 +4,7 @@ ) from unitxt.catalog import add_to_catalog from unitxt.collections_operators import Get, Slice -from unitxt.operators import ListFieldValues, Set, ShuffleFieldValues +from unitxt.operators import IndexOf, ListFieldValues, Set, ShuffleFieldValues from unitxt.string_operators import Join, Replace, Split, Strip from unitxt.test_utils.card import test_card @@ -32,6 +32,9 @@ "instruction": "Respond the following dialog in an helpful and harmfull way.", } ), + IndexOf( + search_in="choices", index_of="output_choice", to_field="output_choice" + ), ], task="tasks.evaluation.preference", templates="templates.evaluation.preference.all", diff --git a/prepare/cards/mt_bench/common.py b/prepare/cards/mt_bench/common.py index 8757426ce..030cdb932 100644 --- a/prepare/cards/mt_bench/common.py +++ b/prepare/cards/mt_bench/common.py @@ -19,12 +19,12 @@ RenameFields( field_to_field={ "model": "model_id", - "judge": "judge_model_id", "user_prompt": "judge_input", "judgment": "judge_output", }, apply_to_streams=["judgment"], ), + Copy(field="judge/0", to_field="judge_model_id", apply_to_streams=["judgment"]), RenameFields( field_to_field={"choices": "model_output"}, apply_to_streams=["model_answer"], @@ -93,7 +93,6 @@ # region Judgment file RenameFields( 
field_to_field={ - "judge": "judge_model_id", "g1_user_prompt": "judge_input_model_1_ordered_first", "g2_user_prompt": "judge_input_model_2_ordered_first", "g1_judgment": "judge_output_model_1_ordered_first", @@ -103,6 +102,7 @@ }, apply_to_streams=["judgment"], ), + Copy(field="judge/0", to_field="judge_model_id", apply_to_streams=["judgment"]), Apply( "model_1", function="str.lower", diff --git a/prepare/cards/mt_bench/response_assessment/pairwise_comparison/multi_turn_gpt4_judgement.py b/prepare/cards/mt_bench/response_assessment/pairwise_comparison/multi_turn_gpt4_judgement.py index d4802034b..9a573be5c 100644 --- a/prepare/cards/mt_bench/response_assessment/pairwise_comparison/multi_turn_gpt4_judgement.py +++ b/prepare/cards/mt_bench/response_assessment/pairwise_comparison/multi_turn_gpt4_judgement.py @@ -51,7 +51,7 @@ ], task="tasks.response_assessment.pairwise_comparison.multi_turn", templates=[ - "templates.response_assessment.pairwise_comparison.mt_bench_multi_turn_with_shuffle" + "templates.response_assessment.pairwise_comparison.mt_bench_multi_turn_with_shuffling" ], ) diff --git a/prepare/cards/mt_bench/response_assessment/pairwise_comparison/multi_turn_with_reference_gpt4_judgement.py b/prepare/cards/mt_bench/response_assessment/pairwise_comparison/multi_turn_with_reference_gpt4_judgement.py index 883a5379a..1decda59e 100644 --- a/prepare/cards/mt_bench/response_assessment/pairwise_comparison/multi_turn_with_reference_gpt4_judgement.py +++ b/prepare/cards/mt_bench/response_assessment/pairwise_comparison/multi_turn_with_reference_gpt4_judgement.py @@ -52,7 +52,7 @@ ], task="tasks.response_assessment.pairwise_comparison.multi_turn_with_reference", templates=[ - "templates.response_assessment.pairwise_comparison.mt_bench_multi_turn_with_reference_with_shuffle" + "templates.response_assessment.pairwise_comparison.mt_bench_multi_turn_with_reference_with_shuffling" ], ) diff --git a/prepare/cards/mt_bench/response_assessment/pairwise_comparison/single_turn_gpt4_judgement.py b/prepare/cards/mt_bench/response_assessment/pairwise_comparison/single_turn_gpt4_judgement.py index 3107b779b..d22f1e0d6 100644 --- a/prepare/cards/mt_bench/response_assessment/pairwise_comparison/single_turn_gpt4_judgement.py +++ b/prepare/cards/mt_bench/response_assessment/pairwise_comparison/single_turn_gpt4_judgement.py @@ -47,7 +47,7 @@ ], task="tasks.response_assessment.pairwise_comparison.single_turn", templates=[ - "templates.response_assessment.pairwise_comparison.mt_bench_single_turn_with_shuffle" + "templates.response_assessment.pairwise_comparison.mt_bench_single_turn_with_shuffling" ], ) diff --git a/prepare/cards/mt_bench/response_assessment/pairwise_comparison/single_turn_with_reference_gpt4_judgement.py b/prepare/cards/mt_bench/response_assessment/pairwise_comparison/single_turn_with_reference_gpt4_judgement.py index 634e018d5..8a9168325 100644 --- a/prepare/cards/mt_bench/response_assessment/pairwise_comparison/single_turn_with_reference_gpt4_judgement.py +++ b/prepare/cards/mt_bench/response_assessment/pairwise_comparison/single_turn_with_reference_gpt4_judgement.py @@ -49,7 +49,7 @@ ], task="tasks.response_assessment.pairwise_comparison.single_turn_with_reference", templates=[ - "templates.response_assessment.pairwise_comparison.mt_bench_single_turn_with_reference_with_shuffle" + "templates.response_assessment.pairwise_comparison.mt_bench_single_turn_with_reference_with_shuffling" ], ) diff --git a/prepare/cards/numeric_nlg.py b/prepare/cards/numeric_nlg.py index 471acbb3f..98b4adb12 100644 
--- a/prepare/cards/numeric_nlg.py +++ b/prepare/cards/numeric_nlg.py @@ -12,13 +12,20 @@ card = TaskCard( loader=LoadHF(path="kasnerz/numericnlg"), # TODO: load from github repo preprocess_steps=[ - Set(fields={"type_of_input": "table", "type_of_output": "description"}), + Set( + fields={ + "type_of_input_a": "table", + "type_of_input_b": "caption", + "type_of_output": "description", + } + ), MapHTMLTableToJSON(field="table_html_clean", to_field="table_out"), - SerializeTableAsMarkdown(field="table_out", to_field="input"), + SerializeTableAsMarkdown(field="table_out", to_field="input_a"), RenameFields(field="description", to_field="output"), + RenameFields(field="caption", to_field="input_b"), ], - task="tasks.generation[metrics=[metrics.bleu,metrics.rouge,metrics.bert_score.bert_base_uncased,metrics.meteor]]", - templates="templates.generation.all", + task="tasks.generation.from_pair", + templates="templates.generation.from_pair.all", __description__="NumericNLG is a dataset for numerical table-to-text generation using pairs of a table and a paragraph of a table description with richer inference from scientific papers.", __tags__={ "modality": "table", diff --git a/prepare/cards/rag/end_to_end/__init__.py b/prepare/cards/rag/end_to_end/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/prepare/cards/rag/end_to_end/clapnq.py b/prepare/cards/rag/end_to_end/clapnq.py new file mode 100644 index 000000000..17236e333 --- /dev/null +++ b/prepare/cards/rag/end_to_end/clapnq.py @@ -0,0 +1,123 @@ +import json +from dataclasses import dataclass + +from unitxt import add_to_catalog +from unitxt.blocks import TaskCard, TemplatesDict +from unitxt.loaders import LoadCSV +from unitxt.operators import Copy, ListFieldValues, Set +from unitxt.templates import InputOutputTemplate +from unitxt.test_utils.card import test_card + + +@dataclass(frozen=True) +class ClapNqBenchmark: + # Raw_data + TRAIN_RAW_FILE_URL: str = "https://raw.githubusercontent.com/primeqa/clapnq/main/retrieval/train/question_train_answerable.tsv" + TEST_RAW_FILE_URL: str = "https://raw.githubusercontent.com/primeqa/clapnq/main/retrieval/dev/question_dev_answerable.tsv" + + # Fields + ID: str = "id" + QUESTION: str = "question" + DOC_ID_LIST: str = "doc-id-list" + ANSWERS: str = "answers" + + +@dataclass(frozen=True) +class ClapNqDocuments: + # Raw_data + RAW_FILE_URL: str = "https://media.githubusercontent.com/media/primeqa/clapnq/main/retrieval/passages.tsv" + + # Fields + ID: str = "id" + TEXT: str = "text" + TITLE: str = "title" + + ARTIFACT_NAME: str = "cards.rag.documents.clap_nq.en" + + +card = TaskCard( + loader=LoadCSV( + sep="\t", + files={ + "train": ClapNqBenchmark.TRAIN_RAW_FILE_URL, + "test": ClapNqBenchmark.TEST_RAW_FILE_URL, + }, + ), + preprocess_steps=[ + Copy( + field_to_field={ + ClapNqBenchmark.QUESTION: "question", + ClapNqBenchmark.ID: "question_id", + }, + ), + Set( + fields={ + "reference_contexts": [], + "is_answerable_label": True, + "metadata_field": "", + } + ), + ListFieldValues( + fields=[ClapNqBenchmark.DOC_ID_LIST], + to_field="reference_context_ids", + ), + ListFieldValues( + fields=[ClapNqBenchmark.ANSWERS], + to_field="reference_answers", + ), + ], + task="tasks.rag.end_to_end", + # templates=["templates.empty"], + templates=TemplatesDict({"default": "templates.rag.end_to_end.json_predictions"}), +) + +wrong_answer = { + "contexts": ["hi"], + "is_answerable": True, + "answer": "Don't know", + "context_ids": ["id0"], +} +test_card( + card, + strict=True, + 
full_mismatch_prediction_values=[json.dumps(wrong_answer)], + debug=False, + demos_taken_from="test", + demos_pool_size=5, +) + +add_to_catalog(card, "cards.rag.benchmark.clap_nq.en", overwrite=True) + +# Documents +card = TaskCard( + loader=LoadCSV(sep="\t", files={"train": ClapNqDocuments.RAW_FILE_URL}), + preprocess_steps=[ + Copy( + field_to_field={ + ClapNqDocuments.ID: "document_id", + ClapNqDocuments.TITLE: "title", + }, + ), + ListFieldValues( + fields=[ClapNqDocuments.TEXT], + to_field="passages", + ), + Set( + fields={ + "metadata_field": "", + } + ), + ], + task="tasks.rag.corpora", + templates=TemplatesDict( + { + "empty": InputOutputTemplate( + input_format="", + output_format="", + ), + } + ), +) + +# Not testing card, because documents are not evaluated. +add_to_catalog(card, "cards.rag.documents.clap_nq.en", overwrite=True) diff --git a/prepare/cards/reward_bench.py b/prepare/cards/reward_bench.py new file mode 100644 index 000000000..fd779b0bd --- /dev/null +++ b/prepare/cards/reward_bench.py @@ -0,0 +1,68 @@ +from unitxt import add_to_catalog +from unitxt.blocks import ( + LoadHF, + TaskCard, +) +from unitxt.operators import FilterByCondition, RenameFields, Set +from unitxt.splitters import RenameSplits +from unitxt.test_utils.card import test_card + +subset_dict = { + "chat": [ + "alpacaeval-easy", + "alpacaeval-length", + "alpacaeval-hard", + "mt-bench-easy", + "mt-bench-med", + ], + "chat-hard": [ + "mt-bench-hard", + "llmbar-natural", + "llmbar-adver-neighbor", + "llmbar-adver-GPTInst", + "llmbar-adver-GPTOut", + "llmbar-adver-manual", + ], + "safety": [ + "refusals-dangerous", + "refusals-offensive", + "xstest-should-refuse", + "xstest-should-respond", + "donotanswer", + ], + "reasoning": [ + "math-prm", + "hep-cpp", + "hep-go", + "hep-java", + "hep-js", + "hep-python", + "hep-rust", + ], +} + +for subset in subset_dict.keys(): + card = TaskCard( + loader=LoadHF(path="allenai/reward-bench", split="filtered"), + preprocess_steps=[ + RenameSplits({"filtered": "test"}), + RenameFields( + field_to_field={ + "prompt": "question", + "chosen": "answer_a", + "rejected": "answer_b", + "subset": "group", + } + ), + Set(fields={"winner": "choice_a"}), + FilterByCondition(values={"group": subset_dict[subset]}, condition="in"), + ], + task="tasks.response_assessment.pairwise_comparison.single_turn", + templates=[ + "templates.response_assessment.pairwise_comparison.mt_bench_single_turn" + ], + ) + + test_card(card, demos_taken_from="test", strict=False, loader_limit=10000) + subset_label = subset.replace("-", "_") + add_to_catalog(card, f"cards.reward_bench.{subset_label}", overwrite=True) diff --git a/prepare/cards/tldr.py b/prepare/cards/tldr.py index c88bdd148..7ad358aba 100644 --- a/prepare/cards/tldr.py +++ b/prepare/cards/tldr.py @@ -9,7 +9,13 @@ card = TaskCard( loader=LoadHF(path="webis/tldr-17", streaming=True), preprocess_steps=[ - SplitRandomMix({"train": "train[50%]", "test": "train[50%]"}), + SplitRandomMix( + { + "train": "train[70%]", + "validation": "train[15%]", + "test": "train[15%]", + } + ), RenameFields(field_to_field={"content": "document"}), Set(fields={"document_type": "document"}), ] @@ -46,6 +52,6 @@ ) add_to_catalog( card, - f"cards.tldr{f'_document_filtered_to_{n_chars_to_filter_by}_chars' if n_chars_to_filter_by!='max' else ''}", + f"cards.tldr{f'_document_filtered_to_{n_chars_to_filter_by}_chars' if n_chars_to_filter_by != 'max' else ''}", overwrite=True, ) diff --git a/prepare/cards/translation/flores101.py b/prepare/cards/translation/flores101.py 
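Aside on the new reward_bench card above: a minimal sketch (not part of this patch) of the rename-split-then-filter pattern it uses. The subset values shown are taken from the "chat" entry of the dict above; everything else is illustrative.

from unitxt.operators import FilterByCondition
from unitxt.splitters import RenameSplits

# The HF dataset ships a single "filtered" split; the card exposes it as "test"
# and then keeps only the rows whose "group" belongs to the requested subset.
preprocess_steps = [
    RenameSplits({"filtered": "test"}),
    FilterByCondition(
        values={
            "group": [
                "alpacaeval-easy",
                "alpacaeval-length",
                "alpacaeval-hard",
                "mt-bench-easy",
                "mt-bench-med",
            ]
        },
        condition="in",
    ),
]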
new file mode 100644 index 000000000..cf00e1f04 --- /dev/null +++ b/prepare/cards/translation/flores101.py @@ -0,0 +1,166 @@ +from unitxt.blocks import Copy, LoadHF, Set, SplitRandomMix, TaskCard +from unitxt.catalog import add_to_catalog +from unitxt.test_utils.card import test_card + +# https://localizely.com/iso-639-2-list/ +iso_lang_code_mapping = { + "eng": "English", + "afr": "Afrikaans", + "amh": "Amharic", + "ara": "Arabic", + "hye": "Armenian", + "asm": "Assamese", + "ast": "Asturian", + "azj": "Azerbaijani", + "bel": "Belarusian", + "ben": "Bengali", + "bos": "Bosnian", + "bul": "Bulgarian", + "mya": "Burmese", + "cat": "Catalan", + "ceb": "Cebuano", + "zho_simpl": "Chinese (Simplified)", + "zho_trad": "Chinese (Traditional)", + "hrv": "Croatian", + "ces": "Czech", + "dan": "Danish", + "nld": "Dutch", + "est": "Estonian", + "tgl": "Tagalog", + "fin": "Finnish", + "fra": "French", + "ful": "Fulah", + "glg": "Galician", + "lug": "Ganda", + "kat": "Georgian", + "deu": "German", + "ell": "Greek", + "guj": "Gujarati", + "hau": "Hausa", + "heb": "Hebrew", + "hin": "Hindi", + "hun": "Hungarian", + "isl": "Icelandic", + "ibo": "Igbo", + "ind": "Indonesian", + "gle": "Irish", + "ita": "Italian", + "jpn": "Japanese", + "jav": "Javanese", + "kea": "Kabuverdianu", + "kam": "Kamba", + "kan": "Kannada", + "kaz": "Kazakh", + "khm": "Khmer", + "kor": "Korean", + "kir": "Kyrgyz", + "lao": "Lao", + "lav": "Latvian", + "lin": "Lingala", + "lit": "Lithuanian", + "luo": "Dholuo", + "ltz": "Luxembourgish", + "mkd": "Macedonian", + "msa": "Malay", + "mal": "Malayalam", + "mlt": "Maltese", + "mri": "Maori", + "mar": "Marathi", + "mon": "Mongolian", + "npi": "Nepali", + "nso": "Northern Sotho", + "nob": "Norwegian Bokmål", + "nya": "Nyanja", + "oci": "Occitan", + "ory": "Odia", + "orm": "Oromo", + "pus": "Pashto", + "fas": "Persian", + "pol": "Polish", + "por": "Portuguese", + "pan": "Punjabi", + "ron": "Romanian", + "rus": "Russian", + "srp": "Serbian", + "sna": "Shona", + "snd": "Sindhi", + "slk": "Slovak", + "slv": "Slovenian", + "som": "Somali", + "ckb": "Sorani Kurdish", + "spa": "Spanish", + "swh": "Swahili", + "swe": "Swedish", + "tgk": "Tajik", + "tam": "Tamil", + "tel": "Telugu", + "tha": "Thai", + "tur": "Turkish", + "ukr": "Ukrainian", + "umb": "Umbundu", + "urd": "Urdu", + "uzb": "Uzbek", + "vie": "Vietnamese", + "cym": "Welsh", + "wol": "Wolof", + "xho": "Xhosa", + "yor": "Yoruba", + "zul": "Zulu", +} + + +langs_to_include = [ # langs currently supported by sacrebleu + "ara", + "fra", + "deu", + "jpn", + "kor", + "por", + "ron", + "spa", +] + +langs = [ + lang + for lang in iso_lang_code_mapping.keys() + if ("eng" not in lang and lang in langs_to_include) +] +pairs = [{"src": lang, "tgt": "eng"} for lang in langs] + [ + {"src": "eng", "tgt": lang} for lang in langs +] + +for pair in pairs: + card = TaskCard( + loader=LoadHF(path="gsarti/flores_101", name="all"), + preprocess_steps=[ + SplitRandomMix({"validation": "dev", "test": "devtest"}), + Copy( + field_to_field={ + f"sentence_{pair['src']}": "text", + f"sentence_{pair['tgt']}": "translation", + }, + ), + Set( + fields={ + "source_language": iso_lang_code_mapping[pair["src"]].lower(), + "target_language": iso_lang_code_mapping[pair["tgt"]].lower(), + } + ), + ], + task="tasks.translation.directed", + templates="templates.translation.directed.all", + ) + + test_card(card, demos_taken_from="test") + add_to_catalog( + card, f"cards.mt.flores_101.{pair['src']}_{pair['tgt']}", overwrite=True + ) + +if __name__ == "__main__": + from unitxt 
import load_dataset + + ds = load_dataset( + "card=cards.mt.flores_101.eng_deu,template_card_index=0", + ) + + ds["test"][0] diff --git a/prepare/cards/wmt/en_de.py b/prepare/cards/translation/wmt/en_de.py similarity index 100% rename from prepare/cards/wmt/en_de.py rename to prepare/cards/translation/wmt/en_de.py diff --git a/prepare/cards/wmt/en_fr.py b/prepare/cards/translation/wmt/en_fr.py similarity index 100% rename from prepare/cards/wmt/en_fr.py rename to prepare/cards/translation/wmt/en_fr.py diff --git a/prepare/cards/wmt/en_ro.py b/prepare/cards/translation/wmt/en_ro.py similarity index 100% rename from prepare/cards/wmt/en_ro.py rename to prepare/cards/translation/wmt/en_ro.py diff --git a/prepare/cards/universal_ner.py b/prepare/cards/universal_ner.py index 84ae5584d..77720c793 100644 --- a/prepare/cards/universal_ner.py +++ b/prepare/cards/universal_ner.py @@ -1,3 +1,5 @@ +import sys + from unitxt import add_to_catalog from unitxt.blocks import LoadHF, TaskCard from unitxt.operators import ( @@ -5,6 +7,7 @@ GetItemByIndex, RenameFields, Set, + Shuffle, ) from unitxt.span_lableing_operators import IobExtractor from unitxt.test_utils.card import test_card @@ -48,6 +51,8 @@ requirements_list=["conllu"], ), preprocess_steps=[ + # The dataset is sorted by classes + Shuffle(page_size=sys.maxsize), RenameFields( field_to_field={"ner_tags": "labels"}, ), diff --git a/prepare/metrics/bleu.py b/prepare/metrics/bleu.py index cadd04aa5..6fc5b2e4c 100644 --- a/prepare/metrics/bleu.py +++ b/prepare/metrics/bleu.py @@ -3,7 +3,7 @@ from unitxt.test_utils.metrics import test_metric metric = HuggingfaceMetric( - hf_metric_name="bleu", main_score="bleu", scale=1.0, prediction_type="str" + hf_metric_name="bleu", main_score="bleu", scale=1.0, prediction_type=str ) predictions = ["hello there general kenobi", "foo bar foobar", "", "not empty"] diff --git a/prepare/metrics/code_mixing_detection.py b/prepare/metrics/code_mixing_detection.py index fc4a5eff4..8f3f47525 100644 --- a/prepare/metrics/code_mixing_detection.py +++ b/prepare/metrics/code_mixing_detection.py @@ -1,4 +1,3 @@ -import torch from unitxt import add_to_catalog from unitxt.logging_utils import get_logger from unitxt.metrics import IsCodeMixed @@ -35,9 +34,9 @@ metric = IsCodeMixed() -if not torch.cuda.is_available() and not torch.backends.mps.is_available(): - logger.info("no gpu available, cannot test metric") -else: +# Because the metric requires downloading very large model (multiple >10GBs), only run +# the test when explicitly requested. 
+if __name__ == "__main__": outputs = test_metric( metric=metric, predictions=examples, diff --git a/prepare/metrics/llm_as_judge/llamaguard.py b/prepare/metrics/llm_as_judge/llamaguard.py index 75464515a..d23d87559 100644 --- a/prepare/metrics/llm_as_judge/llamaguard.py +++ b/prepare/metrics/llm_as_judge/llamaguard.py @@ -1,6 +1,7 @@ from unitxt import add_to_catalog from unitxt.inference import IbmGenAiInferenceEngine from unitxt.llm_as_judge import LLMAsJudge +from unitxt.random_utils import get_seed model_list = [ "meta-llama/llama-3-8b-instruct", @@ -11,7 +12,9 @@ task = "rating.single_turn" for model_id in model_list: - inference_model = IbmGenAiInferenceEngine(model_name=model_id, max_new_tokens=252) + inference_model = IbmGenAiInferenceEngine( + model_name=model_id, max_new_tokens=252, random_seed=get_seed() + ) model_label = model_id.split("/")[1].replace("-", "_").replace(".", ",").lower() model_label = f"{model_label}_ibm_genai" template_label = template.split(".")[-1] diff --git a/prepare/metrics/llm_as_judge/pairwise_rating/llama_3_ibm_genai_arena_hard_template.py b/prepare/metrics/llm_as_judge/pairwise_rating/llama_3_ibm_genai_arena_hard_template.py new file mode 100644 index 000000000..856f1b1d4 --- /dev/null +++ b/prepare/metrics/llm_as_judge/pairwise_rating/llama_3_ibm_genai_arena_hard_template.py @@ -0,0 +1,36 @@ +from unitxt import add_to_catalog +from unitxt.inference import ( + IbmGenAiInferenceEngine, +) +from unitxt.llm_as_judge import LLMAsJudge + +model_list = ["meta-llama/llama-3-8b-instruct", "meta-llama/llama-3-70b-instruct"] +format = "formats.llama3_instruct" +templates = [ + "templates.response_assessment.pairwise_comparative_rating.arena_hard", + "templates.response_assessment.pairwise_comparative_rating.arena_hard_with_shuffling", +] +for template in templates: + task = "pairwise_comparative_rating.single_turn" + + for model_id in model_list: + inference_model = IbmGenAiInferenceEngine( + model_name=model_id, max_new_tokens=2048, random_seed=42 + ) + model_label = model_id.split("/")[1].replace("-", "_").replace(".", ",").lower() + model_label = f"{model_label}_ibm_genai" + template_label = template.split(".")[-1] + metric_label = f"{model_label}_template_{template_label}" + metric = LLMAsJudge( + inference_model=inference_model, + template=template, + task=task, + format=format, + main_score=metric_label, + ) + + add_to_catalog( + metric, + f"metrics.llm_as_judge.pairwise_comparative_rating.{model_label}_template_{template_label}", + overwrite=True, + ) diff --git a/prepare/metrics/llm_as_judge/rating/llama_3_ibm_genai_generic_template.py b/prepare/metrics/llm_as_judge/rating/llama_3_ibm_genai_generic_template.py index 961e86c26..931c17cac 100644 --- a/prepare/metrics/llm_as_judge/rating/llama_3_ibm_genai_generic_template.py +++ b/prepare/metrics/llm_as_judge/rating/llama_3_ibm_genai_generic_template.py @@ -1,12 +1,15 @@ from unitxt import add_to_catalog from unitxt.inference import IbmGenAiInferenceEngine from unitxt.llm_as_judge import LLMAsJudge +from unitxt.random_utils import get_seed model = "meta-llama/llama-3-70b-instruct" format = "formats.llama3_instruct" template = "templates.response_assessment.rating.generic_single_turn" -inference_model = IbmGenAiInferenceEngine(model_name=model, max_new_tokens=252) +inference_model = IbmGenAiInferenceEngine( + model_name=model, max_new_tokens=252, random_seed=get_seed() +) model_label = model.split("/")[1].replace("-", "_").replace(".", ",").lower() model_label = f"{model_label}_ibm_genai" template_label = 
template.split(".")[-1] @@ -17,7 +20,7 @@ task="rating.single_turn", format=format, main_score=metric_label, - prediction_type="str", + prediction_type=str, ) add_to_catalog( diff --git a/prepare/metrics/llm_as_judge/rating/llama_3_ibm_genai_mt_bench_template.py b/prepare/metrics/llm_as_judge/rating/llama_3_ibm_genai_mt_bench_template.py index 8a2e1815b..4e04d801c 100644 --- a/prepare/metrics/llm_as_judge/rating/llama_3_ibm_genai_mt_bench_template.py +++ b/prepare/metrics/llm_as_judge/rating/llama_3_ibm_genai_mt_bench_template.py @@ -1,6 +1,7 @@ from unitxt import add_to_catalog from unitxt.inference import IbmGenAiInferenceEngine from unitxt.llm_as_judge import LLMAsJudge +from unitxt.random_utils import get_seed model_list = ["meta-llama/llama-3-8b-instruct", "meta-llama/llama-3-70b-instruct"] format = "formats.llama3_instruct" @@ -8,7 +9,9 @@ task = "rating.single_turn" for model_id in model_list: - inference_model = IbmGenAiInferenceEngine(model_name=model_id, max_new_tokens=252) + inference_model = IbmGenAiInferenceEngine( + model_name=model_id, max_new_tokens=252, random_seed=get_seed() + ) model_label = model_id.split("/")[1].replace("-", "_").replace(".", ",").lower() model_label = f"{model_label}_ibm_genai" template_label = template.split(".")[-1] @@ -19,7 +22,7 @@ task=task, format=format, main_score=metric_label, - prediction_type="str", + prediction_type=str, ) add_to_catalog( diff --git a/prepare/metrics/llm_as_judge/rating/mistral_huggingface_mt_bench_template.py b/prepare/metrics/llm_as_judge/rating/mistral_huggingface_mt_bench_template.py index 1db357225..7db0ce367 100644 --- a/prepare/metrics/llm_as_judge/rating/mistral_huggingface_mt_bench_template.py +++ b/prepare/metrics/llm_as_judge/rating/mistral_huggingface_mt_bench_template.py @@ -21,7 +21,7 @@ task=task, format=format, main_score=metric_label, - prediction_type="str", + prediction_type=str, ) add_to_catalog( diff --git a/prepare/metrics/meteor.py b/prepare/metrics/meteor.py index c90b3d7cb..787acdc2d 100644 --- a/prepare/metrics/meteor.py +++ b/prepare/metrics/meteor.py @@ -1,8 +1,65 @@ from unitxt import add_to_catalog -from unitxt.metrics import HuggingfaceMetric +from unitxt.metrics import HuggingfaceMetric, Meteor +from unitxt.test_utils.metrics import test_metric -metric = HuggingfaceMetric( - hf_metric_name="meteor", main_score="meteor", prediction_type="str" +metric = Meteor() + +predictions = [ + "It is a guide to action which ensures that the military always obeys the commands of the party", + "We strive for peace", + "On the rag sat the cat", + "I caught the ball", +] +references = [ + [ + "It is a guide to action that ensures that the military will forever heed Party commands" + ], + ["We hope for peace"], + ["The cat sat on the rag"], + ["He threw the ball"], +] + +# the floats shown here are rounded just for the test. 
the actually +# returned score are 15-16 digits to the right of the decimal point +instance_targets = [ + {"meteor": 0.69, "score": 0.69, "score_name": "meteor"}, + {"meteor": 0.64, "score": 0.64, "score_name": "meteor"}, + {"meteor": 0.5, "score": 0.5, "score_name": "meteor"}, + {"meteor": 0.47, "score": 0.47, "score_name": "meteor"}, +] + +global_target = { + "meteor": 0.58, + "meteor_ci_high": 0.59, + "meteor_ci_low": 0.58, + "score": 0.58, + "score_ci_high": 0.59, + "score_ci_low": 0.58, + "score_name": "meteor", +} + +metric.n_resamples = 3 +# to match the setting to occur by testing on the global version, metric2, below + +outputs = test_metric( + metric=metric, + predictions=predictions, + references=references, + instance_targets=instance_targets, + global_target=global_target, +) + +# compare results with the HF version of meteor +metric2 = HuggingfaceMetric( + hf_metric_name="meteor", main_score="meteor", prediction_type=str +) + +outputs = test_metric( + metric=metric2, + predictions=predictions, + references=references, + instance_targets=instance_targets, + global_target=global_target, ) add_to_catalog(metric, "metrics.meteor", overwrite=True) diff --git a/prepare/metrics/normalized_sacrebleu.py b/prepare/metrics/normalized_sacrebleu.py index f1b3e68d4..e8e9e6df4 100644 --- a/prepare/metrics/normalized_sacrebleu.py +++ b/prepare/metrics/normalized_sacrebleu.py @@ -28,6 +28,7 @@ metric = MetricPipeline( main_score="sacrebleu", + prediction_type=str, preprocess_steps=[ Copy( field="task_data/target_language", diff --git a/prepare/metrics/rag.py b/prepare/metrics/rag.py index 4f885a962..b303b1007 100644 --- a/prepare/metrics/rag.py +++ b/prepare/metrics/rag.py @@ -410,9 +410,134 @@ ), ], metric=f"metrics.{base_metric}", - prediction_type="str", + prediction_type=str, ) add_to_catalog( metric, f"metrics.rag.response_generation.{axis}.{base_metric}", overwrite=True ) + +# end to end + +end_to_end_artifact_name_to_main_score = { + "metrics.rag.end_to_end.answer_correctness": "recall", + "metrics.rag.end_to_end.answer_reward": "score", + "metrics.rag.end_to_end.answer_faithfulness": "precision", + "metrics.rag.end_to_end.context_correctness": "score", + "metrics.rag.end_to_end.context_relevance": "score", +} + +end_to_end_artifact_names_to_main_metric = { + "metrics.rag.end_to_end.answer_correctness": "metrics.token_overlap", + "metrics.rag.end_to_end.answer_reward": "metrics.reward.deberta_v3_large_v2", + "metrics.rag.end_to_end.answer_faithfulness": "metrics.token_overlap", + "metrics.rag.end_to_end.context_correctness": "metrics.mrr", + "metrics.rag.end_to_end.context_relevance": "metrics.perplexity_q.flan_t5_small", +} + +assert len(end_to_end_artifact_name_to_main_score) == len( + end_to_end_artifact_names_to_main_metric +) + +copy_field_prediction_answer_to_prediction = Copy( + field_to_field=[ + ( + "prediction/answer", + "prediction", + ) + ], +) + +copy_field_reference_answers_to_references = Copy( + field_to_field={"task_data/reference_answers": "references"}, +) + +copy_field_reference_contexts_to_references = Copy( + field_to_field={"task_data/reference_contexts": "references"} +) + +copy_field_prediction_contexts_to_prediction = Copy( + field_to_field=[ + ( + "prediction/contexts", + "prediction", + ) + ], +) + +copy_field_prediction_context_ids_to_prediction = Copy( + field_to_field=[ + ( + "prediction/context_ids", + "prediction", + ) + ], +) + +copy_field_reference_context_ids_to_references_in_a_list = ListFieldValues( + fields=["task_data/reference_context_ids"], + 
to_field="references", +) + +copy_field_prediction_contexts_to_references = Copy( + field_to_field=[ + ( + "prediction/contexts", + "references", + ) + ], +) + + +copy_field_question_to_prediction = Copy( + field_to_field=[ + ( + "task_data/question", + "prediction", + ) + ], +) + +copy_field_question_to_references_in_a_list = ListFieldValues( + fields=["task_data/question"], + to_field="references", +) + +end_to_end_artifact_names_to_preprocess_steps = { + "metrics.rag.end_to_end.answer_correctness": [ + copy_field_prediction_answer_to_prediction, + copy_field_reference_answers_to_references, + ], + "metrics.rag.end_to_end.answer_reward": [ + copy_field_prediction_answer_to_prediction, + copy_field_question_to_references_in_a_list, + ], + "metrics.rag.end_to_end.answer_faithfulness": [ + copy_field_prediction_contexts_to_references, + copy_field_prediction_answer_to_prediction, + ], + "metrics.rag.end_to_end.context_correctness": [ + copy_field_prediction_context_ids_to_prediction, + copy_field_reference_context_ids_to_references_in_a_list, + ], + "metrics.rag.end_to_end.context_relevance": [ + copy_field_prediction_contexts_to_references, + copy_field_question_to_prediction, + ], +} + + +for artifact_name in end_to_end_artifact_names_to_preprocess_steps.keys(): + metric_short_name = artifact_name.split(".")[-1] + if metric_short_name == "rouge": # rouge does not need a prefix + score_prefix = "" + else: + score_prefix = f"[score_prefix={metric_short_name}_]" + + metric = MetricPipeline( + main_score=end_to_end_artifact_name_to_main_score[artifact_name], + preprocess_steps=end_to_end_artifact_names_to_preprocess_steps[artifact_name], + metric=f"{end_to_end_artifact_names_to_main_metric[artifact_name]}{score_prefix}", + ) + + add_to_catalog(metric, artifact_name, overwrite=True) diff --git a/prepare/metrics/rag_answer_correctness.py b/prepare/metrics/rag_answer_correctness.py index 84effbccf..c4022eca5 100644 --- a/prepare/metrics/rag_answer_correctness.py +++ b/prepare/metrics/rag_answer_correctness.py @@ -49,117 +49,128 @@ def test_answer_correctness(task_data, catalog_name, global_target, instance_tar ) add_to_catalog(metric, new_catalog_name, overwrite=True) -# don't use "A" as a token because it is considered an article and removed by the token overlap -# metric -task_data = [ - { # recall is 0.5 for the first ground_truth, 0 for the second ground_truth. 
- # so overall its max(0.5, 0) = 0.5 - "ground_truths": ["B C", "C"], - "answer": "B", - }, - { # recall is 1/3 - "ground_truths": ["D E F"], - "answer": "B C D", - }, -] - -recall_instance_targets = [ - {"f1": 0.67, "precision": 1.0, "recall": 0.5, "score": 0.5, "score_name": "f1"}, - {"f1": 0.33, "precision": 0.33, "recall": 0.33, "score": 0.33, "score_name": "f1"}, -] - -recall_global_target = { - "f1": 0.5, - "f1_ci_high": 0.67, - "f1_ci_low": 0.33, - "precision": 0.67, - "precision_ci_high": 1.0, - "precision_ci_low": 0.33, - "recall": 0.42, - "recall_ci_high": 0.5, - "recall_ci_low": 0.33, - "score": 0.42, - "score_ci_high": 0.67, - "score_ci_low": 0.33, - "score_name": "f1", -} - - -for catalog_name, global_target, instance_targets in [ - ("metrics.rag.answer_correctness", recall_global_target, recall_instance_targets), - ("metrics.rag.recall", recall_global_target, recall_instance_targets), -]: - test_answer_correctness(task_data, catalog_name, global_target, instance_targets) - +if __name__ == "__main__": + # don't use "A" as a token because it is considered an article and removed by the token overlap + # metric + task_data = [ + { # recall is 0.5 for the first ground_truth, 0 for the second ground_truth. + # so overall its max(0.5, 0) = 0.5 + "ground_truths": ["B C", "C"], + "answer": "B", + }, + { # recall is 1/3 + "ground_truths": ["D E F"], + "answer": "B C D", + }, + ] -test_answer_correctness( - task_data, - catalog_name="metrics.rag.bert_recall", - global_target={ - "f1": 0.71, - "f1_ci_high": 0.71, - "f1_ci_low": 0.71, - "precision": 0.74, - "precision_ci_high": 0.77, - "precision_ci_low": 0.71, - "recall": 0.71, - "recall_ci_high": 0.71, - "recall_ci_low": 0.71, - "score": 0.71, - "score_ci_high": 0.71, - "score_ci_low": 0.71, - "score_name": "f1", - }, - instance_targets=[ + recall_instance_targets = [ + {"f1": 0.67, "precision": 1.0, "recall": 0.5, "score": 0.5, "score_name": "f1"}, { - "f1": 0.71, - "precision": 0.77, - "recall": 0.71, - "score": 0.71, + "f1": 0.33, + "precision": 0.33, + "recall": 0.33, + "score": 0.33, "score_name": "f1", }, - { + ] + + recall_global_target = { + "f1": 0.5, + "f1_ci_high": 0.67, + "f1_ci_low": 0.33, + "precision": 0.67, + "precision_ci_high": 1.0, + "precision_ci_low": 0.33, + "recall": 0.42, + "recall_ci_high": 0.5, + "recall_ci_low": 0.33, + "score": 0.42, + "score_ci_high": 0.67, + "score_ci_low": 0.33, + "score_name": "f1", + } + + for catalog_name, global_target, instance_targets in [ + ( + "metrics.rag.answer_correctness", + recall_global_target, + recall_instance_targets, + ), + ("metrics.rag.recall", recall_global_target, recall_instance_targets), + ]: + test_answer_correctness( + task_data, catalog_name, global_target, instance_targets + ) + + test_answer_correctness( + task_data, + catalog_name="metrics.rag.bert_recall", + global_target={ "f1": 0.71, - "precision": 0.71, + "f1_ci_high": 0.71, + "f1_ci_low": 0.71, + "precision": 0.74, + "precision_ci_high": 0.77, + "precision_ci_low": 0.71, "recall": 0.71, + "recall_ci_high": 0.71, + "recall_ci_low": 0.71, "score": 0.71, + "score_ci_high": 0.71, + "score_ci_low": 0.71, "score_name": "f1", }, - ], -) + instance_targets=[ + { + "f1": 0.71, + "precision": 0.77, + "recall": 0.71, + "score": 0.71, + "score_name": "f1", + }, + { + "f1": 0.71, + "precision": 0.71, + "recall": 0.71, + "score": 0.71, + "score_name": "f1", + }, + ], + ) -test_answer_correctness( - task_data, - catalog_name="metrics.rag.bert_recall_ml", - global_target={ - "f1": 0.86, - "f1_ci_high": 0.97, - 
"f1_ci_low": 0.74, - "precision": 0.86, - "precision_ci_high": 0.97, - "precision_ci_low": 0.74, - "recall": 0.86, - "recall_ci_high": 0.97, - "recall_ci_low": 0.74, - "score": 0.86, - "score_ci_high": 0.97, - "score_ci_low": 0.74, - "score_name": "f1", - }, - instance_targets=[ - { - "f1": 0.97, - "precision": 0.97, - "recall": 0.97, - "score": 0.97, + test_answer_correctness( + task_data, + catalog_name="metrics.rag.bert_recall_ml", + global_target={ + "f1": 0.86, + "f1_ci_high": 0.97, + "f1_ci_low": 0.74, + "precision": 0.86, + "precision_ci_high": 0.97, + "precision_ci_low": 0.74, + "recall": 0.86, + "recall_ci_high": 0.97, + "recall_ci_low": 0.74, + "score": 0.86, + "score_ci_high": 0.97, + "score_ci_low": 0.74, "score_name": "f1", }, - { - "f1": 0.74, - "precision": 0.74, - "recall": 0.74, - "score": 0.74, - "score_name": "f1", - }, - ], -) + instance_targets=[ + { + "f1": 0.97, + "precision": 0.97, + "recall": 0.97, + "score": 0.97, + "score_name": "f1", + }, + { + "f1": 0.74, + "precision": 0.74, + "recall": 0.74, + "score": 0.74, + "score_name": "f1", + }, + ], + ) diff --git a/prepare/metrics/rag_context_correctness.py b/prepare/metrics/rag_context_correctness.py index 3bc8d656c..fc8128b33 100644 --- a/prepare/metrics/rag_context_correctness.py +++ b/prepare/metrics/rag_context_correctness.py @@ -2,12 +2,12 @@ from unitxt.collections_operators import Wrap from unitxt.metrics import MetricPipeline from unitxt.operators import Copy, RenameFields -from unitxt.test_utils.metrics import test_evaluate, test_metric for metric_name, catalog_name in [ ("map", "metrics.rag.map"), ("mrr", "metrics.rag.mrr"), ("mrr", "metrics.rag.context_correctness"), + ("retrieval_at_k", "metrics.rag.retrieval_at_k"), ]: metric = MetricPipeline( main_score="score", @@ -21,78 +21,173 @@ ) add_to_catalog(metric, catalog_name, overwrite=True) -task_data = [ - { # MRR is 1, MAP is (1 + 2/3)/2 = 0.833 - "context_ids": ["A", "B", "C"], - "ground_truths_context_ids": ["A", "C"], - }, - { # MRR and MAP are both 0.5 - "context_ids": ["A", "B"], - "ground_truths_context_ids": ["B"], - }, -] -map_instance_targets = [ - {"map": 0.83, "score": 0.83, "score_name": "map"}, - {"map": 0.5, "score": 0.5, "score_name": "map"}, -] -mrr_instance_targets = [ - {"mrr": 1.0, "score": 1.0, "score_name": "mrr"}, - {"mrr": 0.5, "score": 0.5, "score_name": "mrr"}, -] +if __name__ == "__main__": + from unitxt.test_utils.metrics import test_evaluate, test_metric -map_global_target = { - "map": 0.67, - "map_ci_high": 0.83, - "map_ci_low": 0.5, - "score": 0.67, - "score_ci_high": 0.83, - "score_ci_low": 0.5, - "score_name": "map", -} -mrr_global_target = { - "mrr": 0.75, - "mrr_ci_high": 1.0, - "mrr_ci_low": 0.5, - "score": 0.75, - "score_ci_high": 1.0, - "score_ci_low": 0.5, - "score_name": "mrr", -} + task_data = [ + { # MRR is 1, MAP is (1 + 2/3)/2 = 0.833 + "context_ids": ["A", "B", "C"], + "ground_truths_context_ids": ["A", "C"], + }, + { # MRR and MAP are both 0.5 + "context_ids": ["A", "B"], + "ground_truths_context_ids": ["B"], + }, + ] -for catalog_name, global_target, instance_targets in [ - ("metrics.rag.map", map_global_target, map_instance_targets), - ("metrics.rag.mrr", mrr_global_target, mrr_instance_targets), - ("metrics.rag.context_correctness", mrr_global_target, mrr_instance_targets), -]: - # test the evaluate call - test_evaluate( - global_target, - instance_targets=[ - {"score": instance["score"]} for instance in instance_targets - ], - task_data=task_data, - metric_name=catalog_name, - ) + map_instance_targets = [ + 
{"map": 0.83, "score": 0.83, "score_name": "map"}, + {"map": 0.5, "score": 0.5, "score_name": "map"}, + ] + mrr_instance_targets = [ + {"mrr": 1.0, "score": 1.0, "score_name": "mrr"}, + {"mrr": 0.5, "score": 0.5, "score_name": "mrr"}, + ] + retrieval_at_k_instance_targets = [ + { + "match_at_1": 1.0, + "match_at_3": 1.0, + "match_at_5": 1.0, + "match_at_10": 1.0, + "match_at_20": 1.0, + "match_at_40": 1.0, + "precision_at_1": 1.0, + "precision_at_3": 0.67, + "precision_at_5": 0.67, + "precision_at_10": 0.67, + "precision_at_20": 0.67, + "precision_at_40": 0.67, + "recall_at_1": 0.5, + "recall_at_3": 1.0, + "recall_at_5": 1.0, + "recall_at_10": 1.0, + "recall_at_20": 1.0, + "recall_at_40": 1.0, + "score": 1.0, + "score_name": "match_at_1", + }, + { + "match_at_1": 0.0, + "match_at_10": 1.0, + "match_at_20": 1.0, + "match_at_3": 1.0, + "match_at_40": 1.0, + "match_at_5": 1.0, + "precision_at_1": 0.0, + "precision_at_10": 0.5, + "precision_at_20": 0.5, + "precision_at_3": 0.5, + "precision_at_40": 0.5, + "precision_at_5": 0.5, + "recall_at_1": 0.0, + "recall_at_10": 1.0, + "recall_at_20": 1.0, + "recall_at_3": 1.0, + "recall_at_40": 1.0, + "recall_at_5": 1.0, + "score": 0.0, + "score_name": "match_at_1", + }, + ] - # test using the usual metric pipeline - test_pipeline = MetricPipeline( - main_score="score", - preprocess_steps=[ - RenameFields(field_to_field={"task_data/context_ids": "context_ids"}), - RenameFields( - field_to_field={ - "task_data/ground_truths_context_ids": "ground_truths_context_ids" - } - ), - ], - metric=f"{catalog_name}", - ) - test_metric( - metric=test_pipeline, - predictions=[None, None], - references=[[], []], - instance_targets=instance_targets, - global_target=global_target, - task_data=task_data, - ) + map_global_target = { + "map": 0.67, + "map_ci_high": 0.83, + "map_ci_low": 0.5, + "score": 0.67, + "score_ci_high": 0.83, + "score_ci_low": 0.5, + "score_name": "map", + } + mrr_global_target = { + "mrr": 0.75, + "mrr_ci_high": 1.0, + "mrr_ci_low": 0.5, + "score": 0.75, + "score_ci_high": 1.0, + "score_ci_low": 0.5, + "score_name": "mrr", + } + retrieval_at_k_global_target = { + "match_at_1": 0.5, + "match_at_1_ci_high": 1.0, + "match_at_1_ci_low": 0.0, + "match_at_3": 1.0, + "match_at_5": 1.0, + "match_at_10": 1.0, + "match_at_20": 1.0, + "match_at_40": 1.0, + "precision_at_1": 0.5, + "precision_at_1_ci_high": 1.0, + "precision_at_1_ci_low": 0.0, + "precision_at_3": 0.58, + "precision_at_3_ci_high": 0.67, + "precision_at_3_ci_low": 0.5, + "precision_at_5": 0.58, + "precision_at_5_ci_high": 0.67, + "precision_at_5_ci_low": 0.5, + "precision_at_10": 0.58, + "precision_at_10_ci_high": 0.67, + "precision_at_10_ci_low": 0.5, + "precision_at_20": 0.58, + "precision_at_20_ci_high": 0.67, + "precision_at_20_ci_low": 0.5, + "precision_at_40": 0.58, + "precision_at_40_ci_high": 0.67, + "precision_at_40_ci_low": 0.5, + "recall_at_1": 0.25, + "recall_at_1_ci_high": 0.5, + "recall_at_1_ci_low": 0.0, + "recall_at_3": 1.0, + "recall_at_5": 1.0, + "recall_at_10": 1.0, + "recall_at_20": 1.0, + "recall_at_40": 1.0, + "score": 0.5, + "score_ci_high": 1.0, + "score_ci_low": 0.0, + "score_name": "match_at_1", + } + + for catalog_name, global_target, instance_targets in [ + ("metrics.rag.map", map_global_target, map_instance_targets), + ("metrics.rag.mrr", mrr_global_target, mrr_instance_targets), + ("metrics.rag.context_correctness", mrr_global_target, mrr_instance_targets), + ( + "metrics.rag.retrieval_at_k", + retrieval_at_k_global_target, + retrieval_at_k_instance_targets, + ), + 
]: + # test the evaluate call + test_evaluate( + global_target, + instance_targets=[ + {"score": instance["score"]} for instance in instance_targets + ], + task_data=task_data, + metric_name=catalog_name, + ) + + # test using the usual metric pipeline + test_pipeline = MetricPipeline( + main_score="score", + preprocess_steps=[ + RenameFields(field_to_field={"task_data/context_ids": "context_ids"}), + RenameFields( + field_to_field={ + "task_data/ground_truths_context_ids": "ground_truths_context_ids" + } + ), + ], + metric=f"{catalog_name}", + ) + test_metric( + metric=test_pipeline, + predictions=[None, None], + references=[[], []], + instance_targets=instance_targets, + global_target=global_target, + task_data=task_data, + ) diff --git a/prepare/metrics/regard_metric.py b/prepare/metrics/regard_metric.py index 5e739883c..a91c120ed 100644 --- a/prepare/metrics/regard_metric.py +++ b/prepare/metrics/regard_metric.py @@ -1,3 +1,5 @@ +from typing import Any + from unitxt import add_to_catalog from unitxt.metrics import RegardMetric from unitxt.test_utils.metrics import test_metric @@ -7,7 +9,7 @@ # Regard passes task data in the legacy way using references # instead of using the 'task_data' parameters, so prediction # type and reference type are different - prediction_type="Any", + prediction_type=Any, ) predictions = [ diff --git a/prepare/metrics/rouge.py b/prepare/metrics/rouge.py index 56517b6c6..357806c54 100644 --- a/prepare/metrics/rouge.py +++ b/prepare/metrics/rouge.py @@ -2,7 +2,7 @@ from unitxt.metrics import Rouge from unitxt.test_utils.metrics import test_metric -metric = Rouge(n_resamples=None) +metric = Rouge() predictions = ["hello there", "general kenobi"] references = [["hello", "there"], ["general kenobi", "general yoda"]] @@ -28,13 +28,22 @@ global_target = { "rouge1": 0.83, + "rouge1_ci_high": 1.0, + "rouge1_ci_low": 0.67, "rouge2": 0.5, + "rouge2_ci_high": 1.0, + "rouge2_ci_low": 0.0, "rougeL": 0.83, + "rougeL_ci_high": 1.0, + "rougeL_ci_low": 0.67, "rougeLsum": 0.83, + "rougeLsum_ci_high": 1.0, + "rougeLsum_ci_low": 0.67, "score": 0.83, + "score_ci_high": 1.0, + "score_ci_low": 0.67, "score_name": "rougeL", } - outputs = test_metric( metric=metric, predictions=predictions, @@ -43,27 +52,12 @@ global_target=global_target, ) add_to_catalog(metric, "metrics.rouge", overwrite=True) - -global_target_with_confidence_intervals = global_target.copy() -global_target_with_confidence_intervals.update( - { - "rougeL_ci_low": 0.83, - "rougeL_ci_high": 0.83, - "score_ci_low": 0.83, - "score_ci_high": 0.83, - } +metric = Rouge( + __description__="This is deprecated. 
Use 'metrics.rouge' which also generate confidence intervals" ) -metric_with_confidence_intervals = Rouge() -outputs = test_metric( - metric=metric_with_confidence_intervals, - predictions=predictions, - references=references, - instance_targets=instance_targets, - global_target=global_target_with_confidence_intervals, -) add_to_catalog( - metric_with_confidence_intervals, + metric, "metrics.rouge_with_confidence_intervals", overwrite=True, ) diff --git a/prepare/metrics/safety_metric.py b/prepare/metrics/safety_metric.py index 6a8893a37..2397e1ec8 100644 --- a/prepare/metrics/safety_metric.py +++ b/prepare/metrics/safety_metric.py @@ -1,3 +1,5 @@ +from typing import Any + from unitxt import add_to_catalog from unitxt.metrics import SafetyMetric from unitxt.test_utils.metrics import test_metric @@ -7,7 +9,7 @@ # Safety passes task data in the legacy way using references # instead of using the 'task_data' parameters, so prediction # type and reference type are different - prediction_type="Any", + prediction_type=Any, ) predictions = ["They steal a test papers and steal a test calculator."] diff --git a/prepare/metrics/spearman.py b/prepare/metrics/spearman.py index 2bfba5960..114af2723 100644 --- a/prepare/metrics/spearman.py +++ b/prepare/metrics/spearman.py @@ -10,7 +10,7 @@ Copy(field="references/0", to_field="references"), ], metric=Spearmanr(), - prediction_type="float", + prediction_type=float, ) predictions = [1.0, 2.0, 1.0] diff --git a/prepare/metrics/unnormalized_sacrebleu.py b/prepare/metrics/unnormalized_sacrebleu.py index c91032857..e13138110 100644 --- a/prepare/metrics/unnormalized_sacrebleu.py +++ b/prepare/metrics/unnormalized_sacrebleu.py @@ -20,7 +20,7 @@ metric=HuggingfaceMetric( hf_metric_name="sacrebleu", hf_main_score="score", - prediction_type="str", + prediction_type=str, main_score="sacrebleu", scale=1.0, scaled_fields=["sacrebleu", "precisions"], diff --git a/prepare/metrics/win_rate.py b/prepare/metrics/win_rate.py new file mode 100644 index 000000000..f293cd291 --- /dev/null +++ b/prepare/metrics/win_rate.py @@ -0,0 +1,5 @@ +from unitxt.catalog import add_to_catalog +from unitxt.metrics import WeightedWinRateCorrelation + +metric = WeightedWinRateCorrelation() +add_to_catalog(metric, "metrics.weighted_win_rate_correlation", overwrite=True) diff --git a/prepare/processors/processors.py b/prepare/processors/processors.py index 92d574965..390e62baa 100644 --- a/prepare/processors/processors.py +++ b/prepare/processors/processors.py @@ -6,6 +6,7 @@ from unitxt.processors import ( Capitalize, ConvertToBoolean, + ExtractArenaHardNumericalJudgment, ExtractMtBenchLabelJudgment, ExtractMtBenchRatingJudgment, ExtractWithRegex, @@ -112,7 +113,6 @@ overwrite=True, ) - add_to_catalog( SequentialOperator( steps=[ @@ -204,7 +204,6 @@ overwrite=True, ) - parser = FirstCharacter(field="TBD") example = " A. This is the answer." logger.info(parser.process_value(example)) @@ -244,7 +243,6 @@ overwrite=True, ) - add_to_catalog( SequentialOperator( steps=[ @@ -261,14 +259,12 @@ overwrite=True, ) - double_brackets_regex = r"\[\[(.*?)\]\]" parser = ExtractWithRegex(regex=double_brackets_regex, field="TBD") example = "A. and also B. 
And that is why my final answer is [[Yes]]" logger.info(parser.process_value(example)) assert parser.process_value(example) == "Yes" - add_to_catalog( SequentialOperator( steps=[ @@ -360,3 +356,16 @@ "processors.literal_eval", overwrite=True, ) + + +add_to_catalog( + SequentialOperator( + steps=[ + ExtractArenaHardNumericalJudgment( + field="prediction", + ), + ] + ), + "processors.extract_arena_hard_numerical_judgment", + overwrite=True, +) diff --git a/prepare/system_prompts/models/japanese_llama.py b/prepare/system_prompts/models/japanese_llama.py index abb7af069..9032d14b1 100644 --- a/prepare/system_prompts/models/japanese_llama.py +++ b/prepare/system_prompts/models/japanese_llama.py @@ -7,6 +7,6 @@ add_to_catalog( system_prompt, - "system_prompt.models.japanese_llama", + "system_prompts.models.japanese_llama", overwrite=True, ) diff --git a/prepare/tasks/classification.py b/prepare/tasks/classification.py index 3bb243507..c6741c691 100644 --- a/prepare/tasks/classification.py +++ b/prepare/tasks/classification.py @@ -1,11 +1,13 @@ +from typing import List + from unitxt.blocks import Task from unitxt.catalog import add_to_catalog add_to_catalog( Task( - input_fields={"text": "str", "text_type": "str", "class": "str"}, - reference_fields={"class": "str", "label": "List[str]"}, - prediction_type="List[str]", + input_fields={"text": str, "text_type": str, "class": str}, + reference_fields={"class": str, "label": List[str]}, + prediction_type=List[str], metrics=[ "metrics.f1_micro_multi_label", "metrics.f1_macro_multi_label", @@ -20,9 +22,9 @@ add_to_catalog( Task( - input_fields={"text": "str", "text_type": "str", "class": "str"}, - reference_fields={"class": "str", "label": "int"}, - prediction_type="float", + input_fields={"text": str, "text_type": str, "class": str}, + reference_fields={"class": str, "label": int}, + prediction_type=float, metrics=[ "metrics.accuracy", "metrics.f1_binary", @@ -37,13 +39,13 @@ add_to_catalog( Task( input_fields={ - "text": "str", - "text_type": "str", - "classes": "List[str]", - "type_of_classes": "str", + "text": str, + "text_type": str, + "classes": List[str], + "type_of_classes": str, }, - reference_fields={"labels": "List[str]"}, - prediction_type="List[str]", + reference_fields={"labels": List[str]}, + prediction_type=List[str], metrics=[ "metrics.f1_micro_multi_label", "metrics.accuracy", @@ -59,13 +61,13 @@ add_to_catalog( Task( input_fields={ - "text": "str", - "text_type": "str", - "classes": "List[str]", - "type_of_class": "str", + "text": str, + "text_type": str, + "classes": List[str], + "type_of_class": str, }, - reference_fields={"label": "str"}, - prediction_type="str", + reference_fields={"label": str}, + prediction_type=str, metrics=["metrics.f1_micro", "metrics.accuracy", "metrics.f1_macro"], augmentable_inputs=["text"], defaults={"text_type": "text"}, @@ -77,15 +79,15 @@ add_to_catalog( Task( input_fields={ - "text_a": "str", - "text_a_type": "str", - "text_b": "str", - "text_b_type": "str", - "classes": "List[str]", - "type_of_relation": "str", + "text_a": str, + "text_a_type": str, + "text_b": str, + "text_b_type": str, + "classes": List[str], + "type_of_relation": str, }, - reference_fields={"label": "str"}, - prediction_type="str", + reference_fields={"label": str}, + prediction_type=str, metrics=["metrics.f1_micro", "metrics.accuracy", "metrics.f1_macro"], augmentable_inputs=["text_a", "text_b"], defaults={"text_a_type": "first text", "text_b_type": "second text"}, @@ -98,14 +100,14 @@ add_to_catalog( Task( input_fields={ - 
"text": "str", - "text_type": "str", - "classes": "List[str]", - "type_of_class": "str", - "classes_descriptions": "str", + "text": str, + "text_type": str, + "classes": List[str], + "type_of_class": str, + "classes_descriptions": str, }, - reference_fields={"label": "str"}, - prediction_type="str", + reference_fields={"label": str}, + prediction_type=str, metrics=["metrics.f1_micro", "metrics.accuracy", "metrics.f1_macro"], augmentable_inputs=["text"], defaults={"text_type": "text"}, @@ -117,13 +119,13 @@ add_to_catalog( Task( input_fields={ - "text": "str", - "text_type": "str", - "classes": "List[str]", - "type_of_class": "str", + "text": str, + "text_type": str, + "classes": List[str], + "type_of_class": str, }, - reference_fields={"label": "str"}, - prediction_type="str", + reference_fields={"label": str}, + prediction_type=str, metrics=["metrics.f1_micro", "metrics.accuracy", "metrics.f1_macro"], augmentable_inputs=["text"], defaults={"text_type": "text", "type_of_class": "topic"}, diff --git a/prepare/tasks/completion/multiple_choice.py b/prepare/tasks/completion/multiple_choice.py index a057e1e3e..fce3d030a 100644 --- a/prepare/tasks/completion/multiple_choice.py +++ b/prepare/tasks/completion/multiple_choice.py @@ -1,11 +1,13 @@ +from typing import Any, Dict, List + from unitxt.blocks import Task from unitxt.catalog import add_to_catalog add_to_catalog( Task( - input_fields={"context": "str", "context_type": "str", "choices": "List[str]"}, - reference_fields={"answer": "int", "choices": "List[str]"}, - prediction_type="Any", + input_fields={"context": str, "context_type": str, "choices": List[str]}, + reference_fields={"answer": int, "choices": List[str]}, + prediction_type=Any, metrics=["metrics.accuracy"], ), "tasks.completion.multiple_choice", @@ -15,12 +17,12 @@ add_to_catalog( Task( input_fields={ - "context": "str", - "context_type": "str", - "completion_type": "str", + "context": str, + "context_type": str, + "completion_type": str, }, - reference_fields={"completion": "str"}, - prediction_type="str", + reference_fields={"completion": str}, + prediction_type=str, metrics=["metrics.rouge"], ), "tasks.completion.abstractive", @@ -30,12 +32,12 @@ add_to_catalog( Task( input_fields={ - "context": "str", - "context_type": "str", - "completion_type": "str", + "context": str, + "context_type": str, + "completion_type": str, }, - reference_fields={"completion": "str"}, - prediction_type="Dict[str,Any]", + reference_fields={"completion": str}, + prediction_type=Dict[str, Any], metrics=["metrics.squad"], ), "tasks.completion.extractive", diff --git a/prepare/tasks/evaluation.py b/prepare/tasks/evaluation.py index b942da41b..c73a42598 100644 --- a/prepare/tasks/evaluation.py +++ b/prepare/tasks/evaluation.py @@ -1,10 +1,21 @@ +from typing import List + from unitxt.blocks import Task from unitxt.catalog import add_to_catalog add_to_catalog( Task( - input_fields=["input", "input_type", "output_type", "choices", "instruction"], - reference_fields=["choices", "output_choice"], + input_fields={ + "input": str, + "input_type": str, + "output_type": str, + "choices": List[str], + "instruction": str, + }, + reference_fields={ + "choices": List[str], + "output_choice": int, + }, metrics=[ "metrics.accuracy", ], diff --git a/prepare/tasks/generation.py b/prepare/tasks/generation.py index 82519ec68..0c128285d 100644 --- a/prepare/tasks/generation.py +++ b/prepare/tasks/generation.py @@ -3,9 +3,9 @@ add_to_catalog( Task( - input_fields={"input": "str", "type_of_input": "str", "type_of_output": 
"str"}, - reference_fields={"output": "str"}, - prediction_type="str", + input_fields={"input": str, "type_of_input": str, "type_of_output": str}, + reference_fields={"output": str}, + prediction_type=str, metrics=["metrics.normalized_sacrebleu"], augmentable_inputs=["input"], defaults={"type_of_output": "Text"}, @@ -13,3 +13,27 @@ "tasks.generation", overwrite=True, ) + +add_to_catalog( + Task( + input_fields={ + "input_a": str, + "type_of_input_a": str, + "input_b": str, + "type_of_input_b": str, + "type_of_output": str, + }, + reference_fields={"output": str}, + prediction_type=str, + metrics=[ + "metrics.bleu", + "metrics.rouge", + "metrics.bert_score.bert_base_uncased", + "metrics.meteor", + ], + augmentable_inputs=["input_a", "input_b"], + defaults={"type_of_output": "Text"}, + ), + "tasks.generation.from_pair", + overwrite=True, +) diff --git a/prepare/tasks/language_identification.py b/prepare/tasks/language_identification.py index 0fca85998..4746a7549 100644 --- a/prepare/tasks/language_identification.py +++ b/prepare/tasks/language_identification.py @@ -3,9 +3,9 @@ add_to_catalog( Task( - input_fields={"text": "str"}, - reference_fields={"label": "str"}, - prediction_type="str", + input_fields={"text": str}, + reference_fields={"label": str}, + prediction_type=str, metrics=["metrics.accuracy"], ), "tasks.language_identification", diff --git a/prepare/tasks/ner.py b/prepare/tasks/ner.py index 36ce265b5..57cdf8cf5 100644 --- a/prepare/tasks/ner.py +++ b/prepare/tasks/ner.py @@ -1,16 +1,18 @@ +from typing import List, Tuple + from unitxt.blocks import Task from unitxt.catalog import add_to_catalog add_to_catalog( Task( - input_fields={"text": "str", "entity_type": "str"}, + input_fields={"text": str, "entity_type": str}, reference_fields={ - "spans_starts": "List[int]", - "spans_ends": "List[int]", - "text": "str", - "labels": "List[str]", + "spans_starts": List[int], + "spans_ends": List[int], + "text": str, + "labels": List[str], }, - prediction_type="List[Tuple[str,str]]", + prediction_type=List[Tuple[str, str]], metrics=["metrics.ner"], augmentable_inputs=["text"], ), @@ -20,14 +22,14 @@ add_to_catalog( Task( - input_fields={"text": "str", "entity_types": "List[str]"}, + input_fields={"text": str, "entity_types": List[str]}, reference_fields={ - "spans_starts": "List[int]", - "spans_ends": "List[int]", - "text": "str", - "labels": "List[str]", + "spans_starts": List[int], + "spans_ends": List[int], + "text": str, + "labels": List[str], }, - prediction_type="List[Tuple[str,str]]", + prediction_type=List[Tuple[str, str]], metrics=["metrics.ner"], augmentable_inputs=["text"], ), diff --git a/prepare/tasks/qa/multiple_choice/tasks.py b/prepare/tasks/qa/multiple_choice/tasks.py index c269199ca..53f082ccf 100644 --- a/prepare/tasks/qa/multiple_choice/tasks.py +++ b/prepare/tasks/qa/multiple_choice/tasks.py @@ -1,16 +1,18 @@ +from typing import List, Union + from unitxt.blocks import Task from unitxt.catalog import add_to_catalog add_to_catalog( Task( input_fields={ - "context": "str", - "context_type": "str", - "question": "str", - "choices": "List[str]", + "context": str, + "context_type": str, + "question": str, + "choices": List[str], }, - reference_fields={"answer": "Union[int,str]", "choices": "List[str]"}, - prediction_type="str", + reference_fields={"answer": Union[int, str], "choices": List[str]}, + prediction_type=str, metrics=["metrics.accuracy"], ), "tasks.qa.multiple_choice.with_context", @@ -20,9 +22,9 @@ add_to_catalog( Task( - input_fields={"topic": "str", "question": 
"str", "choices": "List[str]"}, - reference_fields={"answer": "Union[int,str]", "choices": "List[str]"}, - prediction_type="str", + input_fields={"topic": str, "question": str, "choices": List[str]}, + reference_fields={"answer": Union[int, str], "choices": List[str]}, + prediction_type=str, metrics=["metrics.accuracy"], ), "tasks.qa.multiple_choice.with_topic", @@ -31,9 +33,9 @@ add_to_catalog( Task( - input_fields={"question": "str", "choices": "List[str]"}, - reference_fields={"answer": "Union[int,str]", "choices": "List[str]"}, - prediction_type="str", + input_fields={"question": str, "choices": List[str]}, + reference_fields={"answer": Union[int, str], "choices": List[str]}, + prediction_type=str, metrics=["metrics.accuracy"], ), "tasks.qa.multiple_choice.open", @@ -43,14 +45,14 @@ add_to_catalog( Task( input_fields={ - "topic": "str", - "context": "str", - "context_type": "str", - "question": "str", - "choices": "List[str]", + "topic": str, + "context": str, + "context_type": str, + "question": str, + "choices": List[str], }, - reference_fields={"answer": "Union[int,str]", "choices": "List[str]"}, - prediction_type="str", + reference_fields={"answer": Union[int, str], "choices": List[str]}, + prediction_type=str, metrics=["metrics.accuracy"], ), "tasks.qa.multiple_choice.with_context.with_topic", diff --git a/prepare/tasks/qa/tasks.py b/prepare/tasks/qa/tasks.py index e3137ee87..e058c6fb3 100644 --- a/prepare/tasks/qa/tasks.py +++ b/prepare/tasks/qa/tasks.py @@ -1,11 +1,13 @@ +from typing import List + from unitxt.blocks import Task from unitxt.catalog import add_to_catalog add_to_catalog( Task( - input_fields={"context": "str", "context_type": "str", "question": "str"}, - reference_fields={"answers": "List[str]"}, - prediction_type="str", + input_fields={"context": str, "context_type": str, "question": str}, + reference_fields={"answers": List[str]}, + prediction_type=str, metrics=["metrics.squad"], ), "tasks.qa.with_context.extractive", @@ -14,9 +16,9 @@ add_to_catalog( Task( - input_fields={"context": "str", "context_type": "str", "question": "str"}, - reference_fields={"answers": "List[str]"}, - prediction_type="str", + input_fields={"context": str, "context_type": str, "question": str}, + reference_fields={"answers": List[str]}, + prediction_type=str, metrics=["metrics.rouge"], augmentable_inputs=["context", "question"], ), @@ -26,9 +28,9 @@ add_to_catalog( Task( - input_fields={"question": "str"}, - reference_fields={"answers": "List[str]"}, - prediction_type="str", + input_fields={"question": str}, + reference_fields={"answers": List[str]}, + prediction_type=str, metrics=["metrics.rouge"], ), "tasks.qa.open", diff --git a/prepare/tasks/rag/__init__.py b/prepare/tasks/rag/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/prepare/tasks/rag/rag_end_to_end.py b/prepare/tasks/rag/rag_end_to_end.py new file mode 100644 index 000000000..a04f0c48a --- /dev/null +++ b/prepare/tasks/rag/rag_end_to_end.py @@ -0,0 +1,49 @@ +from typing import Any, Dict, List + +from unitxt import add_to_catalog +from unitxt.blocks import Task + +add_to_catalog( + Task( + input_fields={ + "question": str, + "question_id": Any, + "metadata_field": str, + }, + reference_fields={ + "reference_answers": List[str], + "reference_contexts": List[str], + "reference_context_ids": List[str], + "is_answerable_label": bool, + }, + metrics=[ + "metrics.rag.end_to_end.answer_correctness", + "metrics.rag.end_to_end.answer_faithfulness", + "metrics.rag.end_to_end.answer_reward", + 
"metrics.rag.end_to_end.context_correctness", + "metrics.rag.end_to_end.context_relevance", + ], + prediction_type=Dict[str, Any], + augmentable_inputs=["question"], + ), + "tasks.rag.end_to_end", + overwrite=True, +) + +add_to_catalog( + Task( + input_fields={ + "document_id": str, + "title": str, + "passages": List[str], + "metadata_field": str, + }, + reference_fields={}, + prediction_type=Any, + metrics=[ + "metrics.rouge" + ], # We can not define an empty metric, so we gave here a simple one- although rouge is not related + ), + "tasks.rag.corpora", + overwrite=True, +) diff --git a/prepare/tasks/rag/response_generation.py b/prepare/tasks/rag/response_generation.py index 43d43b158..c7cc7b688 100644 --- a/prepare/tasks/rag/response_generation.py +++ b/prepare/tasks/rag/response_generation.py @@ -1,3 +1,5 @@ +from typing import List, Union + from unitxt import add_to_catalog from unitxt.blocks import ( Task, @@ -6,11 +8,11 @@ add_to_catalog( Task( input_fields={ - "contexts": "List[str]", - "contexts_ids": "List[int]", - "question": "str", + "contexts": List[str], + "contexts_ids": Union[List[int], List[str]], + "question": str, }, - reference_fields={"reference_answers": "List[str]"}, + reference_fields={"reference_answers": List[str]}, metrics=[ "metrics.rag.response_generation.correctness.token_overlap", "metrics.rag.response_generation.faithfullness.token_overlap", diff --git a/prepare/tasks/regression/tasks.py b/prepare/tasks/regression/tasks.py index 4aa23d762..9e5284a60 100644 --- a/prepare/tasks/regression/tasks.py +++ b/prepare/tasks/regression/tasks.py @@ -1,16 +1,18 @@ +from typing import Any, Optional + from unitxt.blocks import Task from unitxt.catalog import add_to_catalog add_to_catalog( Task( input_fields={ - "text": "str", - "attribute_name": "str", - "min_value": "Optional[float]", - "max_value": "Optional[float]", + "text": str, + "attribute_name": str, + "min_value": Optional[float], + "max_value": Optional[float], }, - reference_fields={"attribute_value": "float"}, - prediction_type="Any", + reference_fields={"attribute_value": float}, + prediction_type=Any, metrics=["metrics.spearman"], augmentable_inputs=["text"], ), @@ -21,14 +23,14 @@ add_to_catalog( Task( input_fields={ - "text1": "str", - "text2": "str", - "attribute_name": "str", - "min_value": "Optional[float]", - "max_value": "Optional[float]", + "text1": str, + "text2": str, + "attribute_name": str, + "min_value": Optional[float], + "max_value": Optional[float], }, - reference_fields={"attribute_value": "float"}, - prediction_type="Any", + reference_fields={"attribute_value": float}, + prediction_type=Any, metrics=["metrics.spearman"], augmentable_inputs=["text1", "text2"], ), @@ -39,14 +41,14 @@ add_to_catalog( Task( input_fields={ - "text1": "str", - "text2": "str", - "attribute_name": "str", - "min_value": "Optional[float]", - "max_value": "Optional[float]", + "text1": str, + "text2": str, + "attribute_name": str, + "min_value": Optional[float], + "max_value": Optional[float], }, - reference_fields={"attribute_value": "float"}, - prediction_type="Any", + reference_fields={"attribute_value": float}, + prediction_type=Any, metrics=["metrics.spearman"], augmentable_inputs=["text1", "text2"], defaults={"attribute_name": "similarity"}, diff --git a/prepare/tasks/response_assessment/pairwise_comparative_rating/single_turn.py b/prepare/tasks/response_assessment/pairwise_comparative_rating/single_turn.py new file mode 100644 index 000000000..5e6cad93e --- /dev/null +++ 
b/prepare/tasks/response_assessment/pairwise_comparative_rating/single_turn.py @@ -0,0 +1,21 @@ +from unitxt.blocks import Task +from unitxt.catalog import add_to_catalog + +add_to_catalog( + Task( + input_fields={ + "question": str, + "answer_a": str, + "answer_b": str, + "model_a": str, + "model_b": str, + }, + reference_fields={ + "answer_a_preference": int, # Positive numbers for preferring answer_a, negative for answer_b. + }, + prediction_type=int, + metrics=["metrics.weighted_win_rate_correlation", "metrics.accuracy"], + ), + "tasks.response_assessment.pairwise_comparative_rating.single_turn", + overwrite=True, +) diff --git a/prepare/tasks/response_assessment/pairwise_comparison/multi_turn.py b/prepare/tasks/response_assessment/pairwise_comparison/multi_turn.py index 02da1eac9..c4c6f259c 100644 --- a/prepare/tasks/response_assessment/pairwise_comparison/multi_turn.py +++ b/prepare/tasks/response_assessment/pairwise_comparison/multi_turn.py @@ -1,16 +1,18 @@ +from typing import List, Tuple + from unitxt.blocks import Task from unitxt.catalog import add_to_catalog add_to_catalog( Task( input_fields={ - "dialog_a": "List[Tuple[str, str]]", - "dialog_b": "List[Tuple[str, str]]", + "dialog_a": List[Tuple[str, str]], + "dialog_b": List[Tuple[str, str]], }, reference_fields={ - "winner": "str" + "winner": str }, # TODO: Support and change to "Literal['choice_a', 'choice_b', 'tie']"}, - metrics=["metrics.accuracy"], + metrics=["metrics.accuracy", "metrics.f1_micro", "metrics.f1_macro"], ), "tasks.response_assessment.pairwise_comparison.multi_turn", overwrite=True, diff --git a/prepare/tasks/response_assessment/pairwise_comparison/multi_turn_with_reference.py b/prepare/tasks/response_assessment/pairwise_comparison/multi_turn_with_reference.py index b46418bb3..967e2104f 100644 --- a/prepare/tasks/response_assessment/pairwise_comparison/multi_turn_with_reference.py +++ b/prepare/tasks/response_assessment/pairwise_comparison/multi_turn_with_reference.py @@ -1,17 +1,19 @@ +from typing import List, Tuple + from unitxt.blocks import Task from unitxt.catalog import add_to_catalog add_to_catalog( Task( input_fields={ - "dialog_a": "List[Tuple[str, str]]", - "dialog_b": "List[Tuple[str, str]]", - "reference_dialog": "List[Tuple[str, str]]", + "dialog_a": List[Tuple[str, str]], + "dialog_b": List[Tuple[str, str]], + "reference_dialog": List[Tuple[str, str]], }, reference_fields={ - "winner": "str" + "winner": str }, # TODO: Support and change to "Literal['choice_a', 'choice_b', 'tie']"}, - metrics=["metrics.accuracy"], + metrics=["metrics.accuracy", "metrics.f1_micro", "metrics.f1_macro"], ), "tasks.response_assessment.pairwise_comparison.multi_turn_with_reference", overwrite=True, diff --git a/prepare/tasks/response_assessment/pairwise_comparison/single_turn.py b/prepare/tasks/response_assessment/pairwise_comparison/single_turn.py index 30e440de7..629f08fa6 100644 --- a/prepare/tasks/response_assessment/pairwise_comparison/single_turn.py +++ b/prepare/tasks/response_assessment/pairwise_comparison/single_turn.py @@ -4,14 +4,14 @@ add_to_catalog( Task( input_fields={ - "question": "str", - "answer_a": "str", - "answer_b": "str", + "question": str, + "answer_a": str, + "answer_b": str, }, reference_fields={ - "winner": "str" + "winner": str }, # TODO: Support and change to "Literal['choice_a', 'choice_b', 'tie']" - metrics=["metrics.accuracy"], + metrics=["metrics.accuracy", "metrics.f1_micro", "metrics.f1_macro"], ), "tasks.response_assessment.pairwise_comparison.single_turn", overwrite=True, diff 
--git a/prepare/tasks/response_assessment/pairwise_comparison/single_turn_with_reference.py b/prepare/tasks/response_assessment/pairwise_comparison/single_turn_with_reference.py index 2e0948df8..2ec825ffe 100644 --- a/prepare/tasks/response_assessment/pairwise_comparison/single_turn_with_reference.py +++ b/prepare/tasks/response_assessment/pairwise_comparison/single_turn_with_reference.py @@ -4,15 +4,15 @@ add_to_catalog( Task( input_fields={ - "question": "str", - "answer_a": "str", - "answer_b": "str", - "reference_answer": "str", + "question": str, + "answer_a": str, + "answer_b": str, + "reference_answer": str, }, reference_fields={ - "winner": "str" + "winner": str }, # TODO: Support and change to "Literal['choice_a', 'choice_b', 'tie']"}, - metrics=["metrics.accuracy"], + metrics=["metrics.accuracy", "metrics.f1_micro", "metrics.f1_macro"], ), "tasks.response_assessment.pairwise_comparison.single_turn_with_reference", overwrite=True, diff --git a/prepare/tasks/response_assessment/rating/multi_turn.py b/prepare/tasks/response_assessment/rating/multi_turn.py index 4c98a89b9..5f0add8ee 100644 --- a/prepare/tasks/response_assessment/rating/multi_turn.py +++ b/prepare/tasks/response_assessment/rating/multi_turn.py @@ -1,11 +1,14 @@ +from typing import List, Tuple + from unitxt.blocks import Task from unitxt.catalog import add_to_catalog add_to_catalog( Task( - input_fields={"dialog": "List[Tuple[str, str]]"}, - reference_fields={"rating": "float"}, + input_fields={"dialog": List[Tuple[str, str]]}, + reference_fields={"rating": float}, metrics=["metrics.spearman"], + prediction_type=float, ), "tasks.response_assessment.rating.multi_turn", overwrite=True, diff --git a/prepare/tasks/response_assessment/rating/multi_turn_with_reference.py b/prepare/tasks/response_assessment/rating/multi_turn_with_reference.py index 08c2ef2d5..525093dad 100644 --- a/prepare/tasks/response_assessment/rating/multi_turn_with_reference.py +++ b/prepare/tasks/response_assessment/rating/multi_turn_with_reference.py @@ -1,14 +1,17 @@ +from typing import List, Tuple + from unitxt.blocks import Task from unitxt.catalog import add_to_catalog add_to_catalog( Task( input_fields={ - "dialog": "List[Tuple[str, str]]", - "reference_dialog": "List[Tuple[str, str]]", + "dialog": List[Tuple[str, str]], + "reference_dialog": List[Tuple[str, str]], }, - reference_fields={"rating": "float"}, + reference_fields={"rating": float}, metrics=["metrics.spearman"], + prediction_type=float, ), "tasks.response_assessment.rating.multi_turn_with_reference", overwrite=True, diff --git a/prepare/tasks/response_assessment/rating/single_turn.py b/prepare/tasks/response_assessment/rating/single_turn.py index 405262aa6..0a5368802 100644 --- a/prepare/tasks/response_assessment/rating/single_turn.py +++ b/prepare/tasks/response_assessment/rating/single_turn.py @@ -3,9 +3,10 @@ add_to_catalog( Task( - input_fields={"question": "str", "answer": "str"}, - reference_fields={"rating": "float"}, + input_fields={"question": str, "answer": str}, + reference_fields={"rating": float}, metrics=["metrics.spearman"], + prediction_type=float, ), "tasks.response_assessment.rating.single_turn", overwrite=True, diff --git a/prepare/tasks/response_assessment/rating/single_turn_with_reference.py b/prepare/tasks/response_assessment/rating/single_turn_with_reference.py index c93a4114d..d82e0878b 100644 --- a/prepare/tasks/response_assessment/rating/single_turn_with_reference.py +++ b/prepare/tasks/response_assessment/rating/single_turn_with_reference.py @@ -3,9 +3,10 @@ 
add_to_catalog( Task( - input_fields={"question": "str", "answer": "str", "reference_answer": "str"}, - reference_fields={"rating": "float"}, + input_fields={"question": str, "answer": str, "reference_answer": str}, + reference_fields={"rating": float}, metrics=["metrics.spearman"], + prediction_type=float, ), "tasks.response_assessment.rating.single_turn_with_reference", overwrite=True, diff --git a/prepare/tasks/span_labeling.py b/prepare/tasks/span_labeling.py index 9acaa1d35..28d152b12 100644 --- a/prepare/tasks/span_labeling.py +++ b/prepare/tasks/span_labeling.py @@ -1,21 +1,23 @@ +from typing import List, Tuple + from unitxt.blocks import Task from unitxt.catalog import add_to_catalog add_to_catalog( Task( input_fields={ - "text": "str", - "text_type": "str", - "class_type": "str", - "classes": "List[str]", + "text": str, + "text_type": str, + "class_type": str, + "classes": List[str], }, reference_fields={ - "text": "str", - "spans_starts": "List[int]", - "spans_ends": "List[int]", - "labels": "List[str]", + "text": str, + "spans_starts": List[int], + "spans_ends": List[int], + "labels": List[str], }, - prediction_type="List[Tuple[str,str]]", + prediction_type=List[Tuple[str, str]], metrics=[ "metrics.ner", ], diff --git a/prepare/tasks/summarization/abstractive.py b/prepare/tasks/summarization/abstractive.py index b9581a2a1..5ffaf7342 100644 --- a/prepare/tasks/summarization/abstractive.py +++ b/prepare/tasks/summarization/abstractive.py @@ -3,9 +3,9 @@ add_to_catalog( Task( - input_fields={"document": "str", "document_type": "str"}, - reference_fields={"summary": "str"}, - prediction_type="str", + input_fields={"document": str, "document_type": str}, + reference_fields={"summary": str}, + prediction_type=str, metrics=["metrics.rouge"], defaults={"document_type": "document"}, ), diff --git a/prepare/tasks/targeted_sentiment_extraction.py b/prepare/tasks/targeted_sentiment_extraction.py index 785f8a2c8..a4c698da6 100644 --- a/prepare/tasks/targeted_sentiment_extraction.py +++ b/prepare/tasks/targeted_sentiment_extraction.py @@ -1,16 +1,18 @@ +from typing import List, Tuple + from unitxt.blocks import Task from unitxt.catalog import add_to_catalog add_to_catalog( Task( - input_fields={"text": "str", "text_type": "str", "sentiment_class": "str"}, + input_fields={"text": str, "text_type": str, "sentiment_class": str}, reference_fields={ - "spans_starts": "List[int]", - "spans_ends": "List[int]", - "text": "List[str]", - "labels": "List[str]", + "spans_starts": List[int], + "spans_ends": List[int], + "text": List[str], + "labels": List[str], }, - prediction_type="List[Tuple[str,str]]", + prediction_type=List[Tuple[str, str]], metrics=["metrics.ner"], augmentable_inputs=["text"], defaults={"text_type": "text"}, @@ -21,14 +23,14 @@ add_to_catalog( Task( - input_fields={"text": "str", "text_type": "str"}, + input_fields={"text": str, "text_type": str}, reference_fields={ - "spans_starts": "List[int]", - "spans_ends": "List[int]", - "text": "List[str]", - "labels": "List[str]", + "spans_starts": List[int], + "spans_ends": List[int], + "text": List[str], + "labels": List[str], }, - prediction_type="List[Tuple[str,str]]", + prediction_type=List[Tuple[str, str]], metrics=["metrics.ner"], augmentable_inputs=["text"], defaults={"text_type": "text"}, diff --git a/prepare/tasks/translation/directed.py b/prepare/tasks/translation/directed.py index f9620cd17..aad316d08 100644 --- a/prepare/tasks/translation/directed.py +++ b/prepare/tasks/translation/directed.py @@ -4,12 +4,12 @@ add_to_catalog( 
Task( input_fields={ - "text": "str", - "source_language": "str", - "target_language": "str", + "text": str, + "source_language": str, + "target_language": str, }, - reference_fields={"translation": "str"}, - prediction_type="str", + reference_fields={"translation": str}, + prediction_type=str, metrics=["metrics.normalized_sacrebleu"], ), "tasks.translation.directed", diff --git a/prepare/templates/generation/generation.py b/prepare/templates/generation/generation.py index d575f5763..e47898ce4 100644 --- a/prepare/templates/generation/generation.py +++ b/prepare/templates/generation/generation.py @@ -30,3 +30,23 @@ "templates.generation.all", overwrite=True, ) + +add_to_catalog( + InputOutputTemplate( + input_format="Given the following {type_of_input_a} and {type_of_input_b}, generate the corresponding {type_of_output}." + "\n{type_of_input_a}: \n{input_a} \n{type_of_input_b}: \n{input_b} \n{type_of_output}:", + output_format="{output}", + postprocessors=[ + "processors.take_first_non_empty_line", + "processors.lower_case_till_punc", + ], + ), + "templates.generation.from_pair.default", + overwrite=True, +) + +add_to_catalog( + TemplatesList(["templates.generation.from_pair.default"]), + "templates.generation.from_pair.all", + overwrite=True, +) diff --git a/prepare/templates/rag/end_to_end.py b/prepare/templates/rag/end_to_end.py new file mode 100644 index 000000000..1d90220b2 --- /dev/null +++ b/prepare/templates/rag/end_to_end.py @@ -0,0 +1,28 @@ +from unitxt import add_to_catalog +from unitxt.operator import SequentialOperator +from unitxt.struct_data_operators import LoadJson +from unitxt.templates import InputOutputTemplate + +add_to_catalog( + SequentialOperator( + steps=[ + LoadJson( + field="prediction", + process_every_value=False, + ), + ] + ), + "processors.load_json_predictions", + overwrite=True, +) + +add_to_catalog( + # For rag end-to-end tasks + InputOutputTemplate( + input_format="", + output_format='{{"answer": "{reference_answers}", "contexts" : ["{reference_contexts}"], "context_ids" : ["{reference_context_ids}"]}}', + postprocessors=["processors.load_json_predictions"], + ), + "templates.rag.end_to_end.json_predictions", + overwrite=True, +) diff --git a/prepare/templates/response_assessment/pairwise_comparative_rating/arena_hard.py b/prepare/templates/response_assessment/pairwise_comparative_rating/arena_hard.py new file mode 100644 index 000000000..318d51ce7 --- /dev/null +++ b/prepare/templates/response_assessment/pairwise_comparative_rating/arena_hard.py @@ -0,0 +1,44 @@ +from unitxt import add_to_catalog +from unitxt.templates import PairwiseComparativeRatingTemplate + +for to_shuffle in [True, False]: + add_to_catalog( + PairwiseComparativeRatingTemplate( + choice_a_field="answer_a", + choice_b_field="answer_b", + choice_a_id_field="model_a", + choice_b_id_field="model_b", + answer_field="answer_a_preference", + shuffle=to_shuffle, + instruction="Please act as an impartial judge and evaluate the quality of the responses provided by two AI" + " assistants to the user prompt displayed below. You will be given assistant A's answer and" + " assistant B's answer. Your job is to evaluate which assistant's answer is better." + "\n\nBegin your evaluation by generating your own answer to the prompt. You must provide" + " your answers before judging any answers.\n\nWhen evaluating the assistants' answers," + " compare both assistants' answers with your answer. 
You must identify and correct any mistakes or" + " inaccurate information.\n\nThen consider if the assistant's answers are helpful, relevant," + " and concise. Helpful means the answer correctly responds to the prompt or follows the" + " instructions. Note when user prompt has any ambiguity or more than one interpretation," + " it is more helpful and appropriate to ask for clarifications or more information from the" + " user than providing an answer based on assumptions. Relevant means all parts of the response" + " closely connect or are appropriate to what is being asked. Concise means the response is" + " clear and not verbose or excessive.\n\nThen consider the creativity and novelty of the" + " assistant's answers when needed. Finally, identify any missing important information in" + " the assistants' answers that would be beneficial to include when responding to the user" + " prompt.\n\nAfter providing your explanation, you must output only one of the following choices" + " as your final verdict with a label:\n\n" + "1. Assistant A is significantly better: [[A>>B]]\n" + "2. Assistant A is slightly better: [[A>B]]\n" + "3. Tie, relatively the same: [[A=B]]\n" + "4. Assistant B is slightly better: [[B>A]]\n" + "5. Assistant B is significantly better: [[B>>A]]\n\n" + 'Example output: "My final verdict is tie: [[A=B]]".', + input_format="<|User Prompt|>\n{question}\n\n" + "<|The Start of Assistant A's Answer|>\n{answer_a}\n<|The End of Assistant A's Answer|>\n\n" + "<|The Start of Assistant B's Answer|>\n{answer_b}\n<|The End of Assistant B's Answer|>", + postprocessors=["processors.extract_arena_hard_numerical_judgment"], + output_format="{answer_a_preference}", + ), + f"templates.response_assessment.pairwise_comparative_rating.arena_hard{'_with_shuffling' if to_shuffle else ''}", + overwrite=True, + ) diff --git a/prepare/templates/response_assessment/pairwise_comparative_rating/prometheus_arena_hard.py b/prepare/templates/response_assessment/pairwise_comparative_rating/prometheus_arena_hard.py new file mode 100644 index 000000000..55155cf1b --- /dev/null +++ b/prepare/templates/response_assessment/pairwise_comparative_rating/prometheus_arena_hard.py @@ -0,0 +1,55 @@ +from unitxt import add_to_catalog +from unitxt.templates import PairwiseComparativeRatingTemplate + +for to_shuffle in [True, False]: + add_to_catalog( + PairwiseComparativeRatingTemplate( + choice_a_field="answer_a", + choice_b_field="answer_b", + choice_a_id_field="model_a", + choice_b_id_field="model_b", + answer_field="answer_a_preference", + shuffle=to_shuffle, + instruction="###Task Description:\n An instruction (might include an Input inside it), a response to evaluate," + " and a score rubric representing a evaluation criteria are given.\n" + "1. Write a detailed feedback that assess the quality of two responses strictly based on the given score rubric, not evaluating in general.\n" + "2. After writing a feedback, choose a better response between Response A and Response B. You should refer to the score rubric.\n" + '3. The output format should look as follows: "Feedback: (write a feedback for criteria) [RESULT] (A or B)"\n' + "4. Please do not generate any other opening, closing, and explanations.\n\n" + "###Instruction:\n" + "Please act as an impartial judge and evaluate the quality of the responses provided by two AI\n" + " assistants to the user prompt displayed below. You will be given assistant A's answer and" + " assistant B's answer. Your job is to evaluate which assistant's answer is better." 
+ "\n\nBegin your evaluation by generating your own answer to the prompt. You must provide" + " your answers before judging any answers.\n\nWhen evaluating the assistants' answers," + " compare both assistants' answers with your answer. You must identify and correct any mistakes or" + " inaccurate information.\n\nThen consider if the assistant's answers are helpful, relevant," + " and concise. Helpful means the answer correctly responds to the prompt or follows the" + " instructions. Note when user prompt has any ambiguity or more than one interpretation," + " it is more helpful and appropriate to ask for clarifications or more information from the" + " user than providing an answer based on assumptions. Relevant means all parts of the response" + " closely connect or are appropriate to what is being asked. Concise means the response is" + " clear and not verbose or excessive.\n\nThen consider the creativity and novelty of the" + " assistant's answers when needed. Finally, identify any missing important information in" + " the assistants' answers that would be beneficial to include when responding to the user" + " prompt.", + input_format="\n{question}\n\n" + "###Response A:\n" + "{answer_a}\n\n" + "###Response B:\n" + "{answer_b}\n\n" + "###Score Rubric:\n\n" + "You must output only one of the following choices" + " as your final verdict with a label:\n\n" + "1. Assistant A is significantly better: [[A>>B]]\n" + "2. Assistant A is slightly better: [[A>B]]\n" + "3. Tie, relatively the same: [[A=B]]\n" + "4. Assistant B is slightly better: [[B>A]]\n" + "5. Assistant B is significantly better: [[B>>A]]\n\n" + 'Example output: "My final verdict is tie: [[A=B]]".', + postprocessors=["processors.extract_arena_hard_numerical_judgment"], + output_format="{answer_a_preference}", + ), + f"templates.response_assessment.pairwise_comparative_rating.prometheus_arena_hard{'_with_shuffling' if to_shuffle else ''}", + overwrite=True, + ) diff --git a/prepare/templates/response_assessment/pairwise_comparison/mt_bench_multi_turn.py b/prepare/templates/response_assessment/pairwise_comparison/mt_bench_multi_turn.py new file mode 100644 index 000000000..ee05a4a5e --- /dev/null +++ b/prepare/templates/response_assessment/pairwise_comparison/mt_bench_multi_turn.py @@ -0,0 +1,55 @@ +from unitxt import add_to_catalog +from unitxt.templates import DialogFieldsData, DialogPairwiseChoiceTemplate + +for to_shuffle in [True, False]: + add_to_catalog( + DialogPairwiseChoiceTemplate( + dialog_fields=[ + DialogFieldsData( + dialog_field="dialog_a", + assistant_role_label="### Assistant A:", + user_role_label="### User:", + system_role_label="### System:", + ), + DialogFieldsData( + dialog_field="dialog_b", + assistant_role_label="### Assistant B:", + user_role_label="### User:", + system_role_label="### System:", + ), + ], + turns_separator="\n\n", + label_separator="\n", + choice_a_field="dialog_a", + choice_b_field="dialog_b", + answer_field="winner", + choice_a_label="A", + choice_b_label="B", + choice_tie_label="C", + shuffle=to_shuffle, + instruction="Please act as an impartial judge and evaluate the quality of the responses provided by two AI" + " assistants to the user questions. You should choose the assistant that follows the user's" + " instructions and answers the user's questions better. Your evaluation should consider factors" + " such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of their" + " responses. 
You should focus on who provides a better answer to the second user question. " + "Begin your evaluation by comparing the responses of the two assistants and provide a short" + " explanation. Avoid any position biases and ensure that the order in which the responses were" + " presented does not influence your decision. Do not allow the length of the responses to" + " influence your evaluation. Do not favor certain names of the assistants. Be as objective as" + " possible. After providing your explanation, output your final verdict by strictly" + ' following this format: "[[A]]" if assistant A is better, "[[B]]" if assistant B is better,' + ' and "[[C]]" for a tie.\n\n', + input_format="<|The Start of Assistant A's Conversation with User|>\n\n" + "{dialog_a}\n\n" + "<|The End of Assistant A's Conversation with User|>\n\n\n" + "<|The Start of Assistant B's Conversation with User|>\n\n" + "{dialog_b}\n\n" + "<|The End of Assistant B's Conversation with User|>", + output_format="[[{winner}]]", + postprocessors=[ + r"processors.extract_mt_bench_label_judgment", + ], + ), + f"templates.response_assessment.pairwise_comparison.mt_bench_multi_turn{'_with_shuffling' if to_shuffle else ''}", + overwrite=True, + ) diff --git a/prepare/templates/response_assessment/pairwise_comparison/mt_bench_multi_turn_with_reference.py b/prepare/templates/response_assessment/pairwise_comparison/mt_bench_multi_turn_with_reference.py new file mode 100644 index 000000000..9dc40772c --- /dev/null +++ b/prepare/templates/response_assessment/pairwise_comparison/mt_bench_multi_turn_with_reference.py @@ -0,0 +1,63 @@ +from unitxt.catalog import add_to_catalog +from unitxt.templates import DialogFieldsData, DialogPairwiseChoiceTemplate + +for to_shuffle in [True, False]: + add_to_catalog( + DialogPairwiseChoiceTemplate( + dialog_fields=[ + DialogFieldsData( + dialog_field="reference_dialog", + assistant_role_label="### Reference answer:", + user_role_label="### User:", + system_role_label="### System:", + ), + DialogFieldsData( + dialog_field="dialog_a", + assistant_role_label="### Assistant A:", + user_role_label="### User:", + system_role_label="### System:", + ), + DialogFieldsData( + dialog_field="dialog_b", + assistant_role_label="### Assistant B:", + user_role_label="### User:", + system_role_label="### System:", + ), + ], + turns_separator="\n\n", + label_separator="\n", + choice_a_field="dialog_a", + choice_b_field="dialog_b", + answer_field="winner", + choice_a_label="A", + choice_b_label="B", + choice_tie_label="C", + shuffle=to_shuffle, + instruction="Please act as an impartial judge and evaluate the quality of the responses provided by two AI" + " assistants to the user questions. Your evaluation should consider correctness and helpfulness." + " You will be given reference answers, the assistant A's answers, the assistant B's answers." + " Your job is to determine which assistant provides correct and helpful answers to the second" + " user question. Begin your evaluation by comparing both assistants' answers with the reference" + " answers. Identify and correct any mistakes. Avoid any position biases and ensure that the order" + " in which the responses were presented does not influence your decision. Do not allow the length" + " of the responses to influence your evaluation. Do not favor certain names of the assistants." + " Be as objective as possible. 
After providing your explanation, output your final verdict by" + ' strictly following this format: "[[A]]" if assistant A is better, "[[B]]" if assistant B is' + ' better, and "[[C]]" for a tie.\n\n', + input_format="<|The Start of Reference Answer|>\n\n" + "{reference_dialog}\n\n" + "<|The End of Reference Answer|>\n\n\n" + "<|The Start of Assistant A's Conversation with User|>\n\n" + "{dialog_a}\n\n" + "<|The End of Assistant A's Conversation with User|>\n\n\n" + "<|The Start of Assistant B's Conversation with User|>\n\n" + "{dialog_b}\n\n" + "<|The End of Assistant B's Conversation with User|>", + output_format="[[{winner}]]", + postprocessors=[ + r"processors.extract_mt_bench_label_judgment", + ], + ), + f"templates.response_assessment.pairwise_comparison.mt_bench_multi_turn_with_reference{'_with_shuffling' if to_shuffle else ''}", + overwrite=True, + ) diff --git a/prepare/templates/response_assessment/pairwise_comparison/mt_bench_multi_turn_with_reference_with_shuffle.py b/prepare/templates/response_assessment/pairwise_comparison/mt_bench_multi_turn_with_reference_with_shuffle.py deleted file mode 100644 index 33e4f9b43..000000000 --- a/prepare/templates/response_assessment/pairwise_comparison/mt_bench_multi_turn_with_reference_with_shuffle.py +++ /dev/null @@ -1,62 +0,0 @@ -from unitxt.catalog import add_to_catalog -from unitxt.templates import DialogFieldsData, DialogPairwiseChoiceTemplate - -add_to_catalog( - DialogPairwiseChoiceTemplate( - dialog_fields=[ - DialogFieldsData( - dialog_field="reference_dialog", - assistant_role_label="### Reference answer:", - user_role_label="### User:", - system_role_label="### System:", - ), - DialogFieldsData( - dialog_field="dialog_a", - assistant_role_label="### Assistant A:", - user_role_label="### User:", - system_role_label="### System:", - ), - DialogFieldsData( - dialog_field="dialog_b", - assistant_role_label="### Assistant B:", - user_role_label="### User:", - system_role_label="### System:", - ), - ], - turns_separator="\n\n", - label_separator="\n", - choice_a_field="dialog_a", - choice_b_field="dialog_b", - answer_field="winner", - choice_a_label="A", - choice_b_label="B", - choice_tie_label="C", - shuffle=True, - instruction="Please act as an impartial judge and evaluate the quality of the responses provided by two AI" - " assistants to the user questions. Your evaluation should consider correctness and helpfulness." - " You will be given reference answers, the assistant A's answers, the assistant B's answers." - " Your job is to determine which assistant provides correct and helpful answers to the second" - " user question. Begin your evaluation by comparing both assistants' answers with the reference" - " answers. Identify and correct any mistakes. Avoid any position biases and ensure that the order" - " in which the responses were presented does not influence your decision. Do not allow the length" - " of the responses to influence your evaluation. Do not favor certain names of the assistants." - " Be as objective as possible. 
After providing your explanation, output your final verdict by" - ' strictly following this format: "[[A]]" if assistant A is better, "[[B]]" if assistant B is' - ' better, and "[[C]]" for a tie.\n\n', - input_format="<|The Start of Reference Answer|>\n\n" - "{reference_dialog}\n\n" - "<|The End of Reference Answer|>\n\n\n" - "<|The Start of Assistant A's Conversation with User|>\n\n" - "{dialog_a}\n\n" - "<|The End of Assistant A's Conversation with User|>\n\n\n" - "<|The Start of Assistant B's Conversation with User|>\n\n" - "{dialog_b}\n\n" - "<|The End of Assistant B's Conversation with User|>", - output_format="[[{winner}]]", - postprocessors=[ - r"processors.extract_mt_bench_label_judgment", - ], - ), - "templates.response_assessment.pairwise_comparison.mt_bench_multi_turn_with_reference_with_shuffle", - overwrite=True, -) diff --git a/prepare/templates/response_assessment/pairwise_comparison/mt_bench_multi_turn_with_shuffle.py b/prepare/templates/response_assessment/pairwise_comparison/mt_bench_multi_turn_with_shuffle.py deleted file mode 100644 index 66bd6fc49..000000000 --- a/prepare/templates/response_assessment/pairwise_comparison/mt_bench_multi_turn_with_shuffle.py +++ /dev/null @@ -1,54 +0,0 @@ -from unitxt import add_to_catalog -from unitxt.templates import DialogFieldsData, DialogPairwiseChoiceTemplate - -add_to_catalog( - DialogPairwiseChoiceTemplate( - dialog_fields=[ - DialogFieldsData( - dialog_field="dialog_a", - assistant_role_label="### Assistant A:", - user_role_label="### User:", - system_role_label="### System:", - ), - DialogFieldsData( - dialog_field="dialog_b", - assistant_role_label="### Assistant B:", - user_role_label="### User:", - system_role_label="### System:", - ), - ], - turns_separator="\n\n", - label_separator="\n", - choice_a_field="dialog_a", - choice_b_field="dialog_b", - answer_field="winner", - choice_a_label="A", - choice_b_label="B", - choice_tie_label="C", - shuffle=True, - instruction="Please act as an impartial judge and evaluate the quality of the responses provided by two AI" - " assistants to the user questions. You should choose the assistant that follows the user's" - " instructions and answers the user's questions better. Your evaluation should consider factors" - " such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of their" - " responses. You should focus on who provides a better answer to the second user question. " - "Begin your evaluation by comparing the responses of the two assistants and provide a short" - " explanation. Avoid any position biases and ensure that the order in which the responses were" - " presented does not influence your decision. Do not allow the length of the responses to" - " influence your evaluation. Do not favor certain names of the assistants. Be as objective as" - " possible. 
After providing your explanation, output your final verdict by strictly" - ' following this format: "[[A]]" if assistant A is better, "[[B]]" if assistant B is better,' - ' and "[[C]]" for a tie.\n\n', - input_format="<|The Start of Assistant A's Conversation with User|>\n\n" - "{dialog_a}\n\n" - "<|The End of Assistant A's Conversation with User|>\n\n\n" - "<|The Start of Assistant B's Conversation with User|>\n\n" - "{dialog_b}\n\n" - "<|The End of Assistant B's Conversation with User|>", - output_format="[[{winner}]]", - postprocessors=[ - r"processors.extract_mt_bench_label_judgment", - ], - ), - "templates.response_assessment.pairwise_comparison.mt_bench_multi_turn_with_shuffle", - overwrite=True, -) diff --git a/prepare/templates/response_assessment/pairwise_comparison/mt_bench_single_turn.py b/prepare/templates/response_assessment/pairwise_comparison/mt_bench_single_turn.py new file mode 100644 index 000000000..c34bae2a8 --- /dev/null +++ b/prepare/templates/response_assessment/pairwise_comparison/mt_bench_single_turn.py @@ -0,0 +1,35 @@ +from unitxt import add_to_catalog +from unitxt.templates import PairwiseChoiceTemplate + +for to_shuffle in [True, False]: + add_to_catalog( + PairwiseChoiceTemplate( + choice_a_field="answer_a", + choice_b_field="answer_b", + answer_field="winner", + choice_a_label="A", + choice_b_label="B", + choice_tie_label="C", + shuffle=to_shuffle, + instruction="Please act as an impartial judge and evaluate the quality of the responses provided by two" + " AI assistants to the user question displayed below. You should choose the assistant that" + " follows the user's instructions and answers the user's question better. Your evaluation should" + " consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of" + " detail of their responses. Begin your evaluation by comparing the two responses and provide a" + " short explanation. Avoid any position biases and ensure that the order in which the responses" + " were presented does not influence your decision. Do not allow the length of the responses to" + " influence your evaluation. Do not favor certain names of the assistants. Be as objective as" + " possible. 
After providing your explanation, output your final verdict by strictly following" + ' this format: "[[A]]" if assistant A is better, "[[B]]" if assistant B is better,' + ' and "[[C]]" for a tie.\n\n', + input_format="[User Question]\n{question}\n\n" + "[The Start of Assistant A's Answer]\n{answer_a}\n[The End of Assistant A's Answer]\n\n" + "[The Start of Assistant B's Answer]\n{answer_b}\n[The End of Assistant B's Answer]", + output_format="[[{winner}]]", + postprocessors=[ + r"processors.extract_mt_bench_label_judgment", + ], + ), + f"templates.response_assessment.pairwise_comparison.mt_bench_single_turn{'_with_shuffling' if to_shuffle else ''}", + overwrite=True, + ) diff --git a/prepare/templates/response_assessment/pairwise_comparison/mt_bench_single_turn_with_reference.py b/prepare/templates/response_assessment/pairwise_comparison/mt_bench_single_turn_with_reference.py new file mode 100644 index 000000000..293690c99 --- /dev/null +++ b/prepare/templates/response_assessment/pairwise_comparison/mt_bench_single_turn_with_reference.py @@ -0,0 +1,36 @@ +from unitxt import add_to_catalog +from unitxt.templates import PairwiseChoiceTemplate + +for to_shuffle in [True, False]: + add_to_catalog( + PairwiseChoiceTemplate( + choice_a_field="answer_a", + choice_b_field="answer_b", + answer_field="winner", + choice_a_label="A", + choice_b_label="B", + choice_tie_label="C", + shuffle=to_shuffle, + instruction="Please act as an impartial judge and evaluate the quality of the responses provided by two AI" + " assistants to the user question displayed below. Your evaluation should consider correctness" + " and helpfulness. You will be given a reference answer, assistant A's answer, and assistant" + " B's answer. Your job is to evaluate which assistant's answer is better. Begin your evaluation" + " by comparing both assistants' answers with the reference answer. Identify and correct any" + " mistakes. Avoid any position biases and ensure that the order in which the responses were" + " presented does not influence your decision. Do not allow the length of the responses to" + " influence your evaluation. Do not favor certain names of the assistants. Be as objective" + " as possible. 
After providing your explanation, output your final verdict by strictly" + ' following this format: "[[A]]" if assistant A is better, "[[B]]" if assistant B is better,' + ' and "[[C]]" for a tie.\n\n', + input_format="[User Question]\n{question}\n\n" + "[The Start of Reference Answer]\n{reference_answer}\n[The End of Reference Answer]\n\n" + "[The Start of Assistant A's Answer]\n{answer_a}\n[The End of Assistant A's Answer]\n\n" + "[The Start of Assistant B's Answer]\n{answer_b}\n[The End of Assistant B's Answer]", + output_format="[[{winner}]]", + postprocessors=[ + r"processors.extract_mt_bench_label_judgment", + ], + ), + f"templates.response_assessment.pairwise_comparison.mt_bench_single_turn_with_reference{'_with_shuffling' if to_shuffle else ''}", + overwrite=True, + ) diff --git a/prepare/templates/response_assessment/pairwise_comparison/mt_bench_single_turn_with_reference_with_shuffle.py b/prepare/templates/response_assessment/pairwise_comparison/mt_bench_single_turn_with_reference_with_shuffle.py deleted file mode 100644 index 55e1712d4..000000000 --- a/prepare/templates/response_assessment/pairwise_comparison/mt_bench_single_turn_with_reference_with_shuffle.py +++ /dev/null @@ -1,35 +0,0 @@ -from unitxt import add_to_catalog -from unitxt.templates import PairwiseChoiceTemplate - -add_to_catalog( - PairwiseChoiceTemplate( - choice_a_field="answer_a", - choice_b_field="answer_b", - answer_field="winner", - choice_a_label="A", - choice_b_label="B", - choice_tie_label="C", - shuffle=True, - instruction="Please act as an impartial judge and evaluate the quality of the responses provided by two AI" - " assistants to the user question displayed below. Your evaluation should consider correctness" - " and helpfulness. You will be given a reference answer, assistant A's answer, and assistant" - " B's answer. Your job is to evaluate which assistant's answer is better. Begin your evaluation" - " by comparing both assistants' answers with the reference answer. Identify and correct any" - " mistakes. Avoid any position biases and ensure that the order in which the responses were" - " presented does not influence your decision. Do not allow the length of the responses to" - " influence your evaluation. Do not favor certain names of the assistants. Be as objective" - " as possible. 
After providing your explanation, output your final verdict by strictly" - ' following this format: "[[A]]" if assistant A is better, "[[B]]" if assistant B is better,' - ' and "[[C]]" for a tie.\n\n', - input_format="[User Question]\n{question}\n\n" - "[The Start of Reference Answer]\n{reference_answer}\n[The End of Reference Answer]\n\n" - "[The Start of Assistant A's Answer]\n{answer_a}\n[The End of Assistant A's Answer]\n\n" - "[The Start of Assistant B's Answer]\n{answer_b}\n[The End of Assistant B's Answer]", - output_format="[[{winner}]]", - postprocessors=[ - r"processors.extract_mt_bench_label_judgment", - ], - ), - "templates.response_assessment.pairwise_comparison.mt_bench_single_turn_with_reference_with_shuffle", - overwrite=True, -) diff --git a/prepare/templates/response_assessment/pairwise_comparison/mt_bench_single_turn_with_shuffle.py b/prepare/templates/response_assessment/pairwise_comparison/mt_bench_single_turn_with_shuffle.py deleted file mode 100644 index 1a062f5e4..000000000 --- a/prepare/templates/response_assessment/pairwise_comparison/mt_bench_single_turn_with_shuffle.py +++ /dev/null @@ -1,34 +0,0 @@ -from unitxt import add_to_catalog -from unitxt.templates import PairwiseChoiceTemplate - -add_to_catalog( - PairwiseChoiceTemplate( - choice_a_field="answer_a", - choice_b_field="answer_b", - answer_field="winner", - choice_a_label="A", - choice_b_label="B", - choice_tie_label="C", - shuffle=True, - instruction="Please act as an impartial judge and evaluate the quality of the responses provided by two" - " AI assistants to the user question displayed below. You should choose the assistant that" - " follows the user's instructions and answers the user's question better. Your evaluation should" - " consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of" - " detail of their responses. Begin your evaluation by comparing the two responses and provide a" - " short explanation. Avoid any position biases and ensure that the order in which the responses" - " were presented does not influence your decision. Do not allow the length of the responses to" - " influence your evaluation. Do not favor certain names of the assistants. Be as objective as" - " possible. After providing your explanation, output your final verdict by strictly following" - ' this format: "[[A]]" if assistant A is better, "[[B]]" if assistant B is better,' - ' and "[[C]]" for a tie.\n\n', - input_format="[User Question]\n{question}\n\n" - "[The Start of Assistant A's Answer]\n{answer_a}\n[The End of Assistant A's Answer]\n\n" - "[The Start of Assistant B's Answer]\n{answer_b}\n[The End of Assistant B's Answer]", - output_format="[[{winner}]]", - postprocessors=[ - r"processors.extract_mt_bench_label_judgment", - ], - ), - "templates.response_assessment.pairwise_comparison.mt_bench_single_turn_with_shuffle", - overwrite=True, -) diff --git a/pyproject.toml b/pyproject.toml index f9ff6bf57..798be6f09 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -42,7 +42,7 @@ target-version = "py38" "src/unitxt/dataset.py" = ["F811", "F401"] "src/unitxt/blocks.py" = ["F811", "F401"] "tests/library/test_loaders.py" = ["N802", "N803"] -"tests/library/test_dataclass.py" = ["F811"] +"tests/library/test_dataclass.py" = ["F811", "E731"] "src/unitxt/validate.py" = ["B024"] "src/unitxt/standard.py" = ["C901"] "src/unitxt/type_utils.py" = ["C901"] @@ -110,8 +110,8 @@ extend-immutable-calls = ["fastapi.Depends", "fastapi.params.Depends", "fastapi. 
"src".msg = "Use unitxt outside src/ and relative imports inside src/ and install unitxt from source with `pip install -e '.[dev]'`." [tool.codespell] -ignore-words-list = 'rouge,ot,ans,nd,cann' +ignore-words-list = 'rouge,ot,ans,nd,cann,som,tha,vie' check-filenames = true check-hidden = false regex = "(? None: class MissingArtifactTypeError(ValueError): def __init__(self, dic) -> None: message = ( - f"Missing 'type' parameter. Expected 'type' in artifact dict, got {dic}" + f"Missing '__type__' parameter. Expected 'type' in artifact dict, got {dic}" ) super().__init__(message) @@ -224,7 +224,9 @@ def _recursive_load(cls, obj): pass if cls.is_artifact_dict(obj): cls.verify_artifact_dict(obj) - return cls._class_register[obj.pop("__type__")](**obj) + artifact_class = cls._class_register[obj.pop("__type__")] + obj = artifact_class.process_data_after_load(obj) + return artifact_class(**obj) return obj @@ -289,7 +291,17 @@ def __post_init__(self): self.verify() def _to_raw_dict(self): - return {"__type__": self.__type__, **self._init_dict} + return { + "__type__": self.__type__, + **self.process_data_before_dump(self._init_dict), + } + + def process_data_before_dump(self, data): + return data + + @classmethod + def process_data_after_load(cls, data): + return data def to_json(self): data = self.to_dict() @@ -454,7 +466,6 @@ def fetch_artifact(artifact_rep) -> Tuple[Artifact, Union[Artifactory, None]]: # If Json string, first load into dictionary if isinstance(artifact_rep, str): artifact_rep = json.loads(artifact_rep) - # Load from dictionary (fails if not valid dictionary) return Artifact.from_dict(artifact_rep), None diff --git a/src/unitxt/catalog/cards/arena_hard/generation/english_gpt_4_0314_reference.json b/src/unitxt/catalog/cards/arena_hard/generation/english_gpt_4_0314_reference.json new file mode 100644 index 000000000..6f344bcd7 --- /dev/null +++ b/src/unitxt/catalog/cards/arena_hard/generation/english_gpt_4_0314_reference.json @@ -0,0 +1,118 @@ +{ + "__type__": "task_card", + "loader": { + "__type__": "load_from_hf_space", + "space_name": "lmsys/arena-hard-browser", + "revision": "03b91ca", + "data_files": { + "questions": "data/arena-hard-v0.1/question.jsonl", + "model_answer": "data/arena-hard-v0.1/model_answer/gpt-4-0314.jsonl" + } + }, + "preprocess_steps": [ + { + "__type__": "rename_fields", + "field_to_field": { + "cluster": "group" + }, + "apply_to_streams": [ + "questions" + ] + }, + { + "__type__": "copy", + "field_to_field": { + "turns/0/content": "model_input" + }, + "apply_to_streams": [ + "questions" + ] + }, + { + "__type__": "set", + "fields": { + "reference_model": "gpt-4-0314" + }, + "apply_to_streams": [ + "questions" + ] + }, + { + "__type__": "copy", + "field_to_field": { + "choices/0/turns/0/content": "reference_model_output", + "choices/0/turns/0/token_len": "reference_model_output_token_len" + }, + "apply_to_streams": [ + "model_answer" + ] + }, + { + "__type__": "rename_fields", + "field_to_field": { + "model_id": "reference_model" + }, + "apply_to_streams": [ + "model_answer" + ] + }, + { + "__type__": "apply", + "function": "str.lower", + "to_field": "reference_model", + "apply_to_streams": [ + "model_answer" + ], + "_argv": [ + "reference_model" + ] + }, + { + "__type__": "join_streams", + "left_stream": "questions", + "right_stream": "model_answer", + "how": "inner", + "on": [ + "question_id", + "reference_model" + ], + "new_stream_name": "test" + }, + { + "__type__": "delete_splits", + "splits": [ + "questions", + "model_answer" + ] + }, + { + 
"__type__": "select_fields", + "fields": [ + "question_id", + "category", + "model_input", + "reference_model", + "reference_model_output" + ] + }, + { + "__type__": "rename_fields", + "field_to_field": { + "model_input": "input", + "category": "group", + "reference_model_output": "output" + } + }, + { + "__type__": "set", + "fields": { + "type_of_input": "prompt", + "type_of_output": "answer" + } + } + ], + "task": "tasks.generation", + "templates": [ + "templates.empty" + ] +} diff --git a/src/unitxt/catalog/cards/arena_hard/response_assessment/pairwise_comparative_rating/both_games_gpt_4_judge.json b/src/unitxt/catalog/cards/arena_hard/response_assessment/pairwise_comparative_rating/both_games_gpt_4_judge.json new file mode 100644 index 000000000..a895e95b6 --- /dev/null +++ b/src/unitxt/catalog/cards/arena_hard/response_assessment/pairwise_comparative_rating/both_games_gpt_4_judge.json @@ -0,0 +1,83 @@ +{ + "__type__": "task_card", + "loader": { + "__type__": "load_from_hf_space", + "space_name": "lmsys/arena-hard-browser", + "revision": "03b91ca", + "data_files": { + "questions": "data/arena-hard-v0.1/question.jsonl", + "model_answer": "data/arena-hard-v0.1/model_answer/*.jsonl", + "judgment": "data/arena-hard-v0.1/model_judgment/gpt-4-1106-preview/*.jsonl" + } + }, + "preprocess_steps": [ + "operators.arena_hard_hf_space_processing_steps", + { + "__type__": "duplicate_split", + "split": "test", + "to_split": "game_2" + }, + { + "__type__": "rename_fields", + "field_to_field": { + "model_input": "question", + "model_1_output": "answer_a", + "model_2_output": "answer_b", + "score_model_1_ordered_first": "answer_a_preference", + "category": "group", + "model_1": "model_a", + "model_2": "model_b" + }, + "apply_to_streams": [ + "test" + ] + }, + { + "__type__": "rename_fields", + "field_to_field": { + "model_input": "question", + "model_1_output": "answer_b", + "model_2_output": "answer_a", + "score_model_2_ordered_first": "answer_a_preference", + "category": "group", + "model_1": "model_b", + "model_2": "model_a" + }, + "apply_to_streams": [ + "game_2" + ] + }, + { + "__type__": "merge_streams", + "streams_to_merge": [ + "test", + "game_2" + ], + "new_stream_name": "test", + "add_origin_stream_name": false + }, + { + "__type__": "delete_splits", + "splits": [ + "game_2" + ] + }, + { + "__type__": "map_instance_values", + "mappers": { + "answer_a_preference": { + "A=B": 0, + "A>B": 1, + "A>>B": 3, + "B>A": -1, + "B>>A": -3 + } + } + } + ], + "task": "tasks.response_assessment.pairwise_comparative_rating.single_turn", + "templates": [ + "templates.response_assessment.pairwise_comparative_rating.arena_hard", + "templates.response_assessment.pairwise_comparative_rating.arena_hard_with_shuffling" + ] +} diff --git a/src/unitxt/catalog/cards/arena_hard/response_assessment/pairwise_comparative_rating/both_games_mean_judgment_gpt4_judge.json b/src/unitxt/catalog/cards/arena_hard/response_assessment/pairwise_comparative_rating/both_games_mean_judgment_gpt4_judge.json new file mode 100644 index 000000000..4b1e4df85 --- /dev/null +++ b/src/unitxt/catalog/cards/arena_hard/response_assessment/pairwise_comparative_rating/both_games_mean_judgment_gpt4_judge.json @@ -0,0 +1,56 @@ +{ + "__type__": "task_card", + "loader": { + "__type__": "load_from_hf_space", + "space_name": "lmsys/arena-hard-browser", + "revision": "03b91ca", + "data_files": { + "questions": "data/arena-hard-v0.1/question.jsonl", + "model_answer": "data/arena-hard-v0.1/model_answer/*.jsonl", + "judgment": 
"data/arena-hard-v0.1/model_judgment/gpt-4-1106-preview/*.jsonl" + } + }, + "preprocess_steps": [ + "operators.arena_hard_hf_space_processing_steps", + { + "__type__": "map_instance_values", + "mappers": { + "score_model_1_ordered_first": { + "A=B": 0, + "A>B": 1, + "A>>B": 3, + "B>A": -1, + "B>>A": -3 + }, + "score_model_2_ordered_first": { + "A=B": 0, + "A>B": -1, + "A>>B": -3, + "B>A": 1, + "B>>A": 3 + } + } + }, + { + "__type__": "execute_expression", + "to_field": "answer_a_preference", + "expression": "int(round((score_model_1_ordered_first+score_model_2_ordered_first)/2))" + }, + { + "__type__": "rename_fields", + "field_to_field": { + "model_input": "question", + "model_1_output": "answer_a", + "model_2_output": "answer_b", + "category": "group", + "model_1": "model_a", + "model_2": "model_b" + } + } + ], + "task": "tasks.response_assessment.pairwise_comparative_rating.single_turn", + "templates": [ + "templates.response_assessment.pairwise_comparative_rating.arena_hard", + "templates.response_assessment.pairwise_comparative_rating.arena_hard_with_shuffling" + ] +} diff --git a/src/unitxt/catalog/cards/arena_hard/response_assessment/pairwise_comparative_rating/first_game_only_gpt_4_judge.json b/src/unitxt/catalog/cards/arena_hard/response_assessment/pairwise_comparative_rating/first_game_only_gpt_4_judge.json new file mode 100644 index 000000000..43f0351d5 --- /dev/null +++ b/src/unitxt/catalog/cards/arena_hard/response_assessment/pairwise_comparative_rating/first_game_only_gpt_4_judge.json @@ -0,0 +1,45 @@ +{ + "__type__": "task_card", + "loader": { + "__type__": "load_from_hf_space", + "space_name": "lmsys/arena-hard-browser", + "revision": "03b91ca", + "data_files": { + "questions": "data/arena-hard-v0.1/question.jsonl", + "model_answer": "data/arena-hard-v0.1/model_answer/*.jsonl", + "judgment": "data/arena-hard-v0.1/model_judgment/gpt-4-1106-preview/*.jsonl" + } + }, + "preprocess_steps": [ + "operators.arena_hard_hf_space_processing_steps", + { + "__type__": "rename_fields", + "field_to_field": { + "model_input": "question", + "model_1_output": "answer_a", + "model_2_output": "answer_b", + "score_model_1_ordered_first": "answer_a_preference", + "category": "group", + "model_1": "model_a", + "model_2": "model_b" + } + }, + { + "__type__": "map_instance_values", + "mappers": { + "answer_a_preference": { + "A=B": 0, + "A>B": 1, + "A>>B": 3, + "B>A": -1, + "B>>A": -3 + } + } + } + ], + "task": "tasks.response_assessment.pairwise_comparative_rating.single_turn", + "templates": [ + "templates.response_assessment.pairwise_comparative_rating.arena_hard", + "templates.response_assessment.pairwise_comparative_rating.arena_hard_with_shuffling" + ] +} diff --git a/src/unitxt/catalog/cards/billsum.json b/src/unitxt/catalog/cards/billsum.json index 9d7f072d3..3a757e2ef 100644 --- a/src/unitxt/catalog/cards/billsum.json +++ b/src/unitxt/catalog/cards/billsum.json @@ -13,10 +13,6 @@ "test": "test" } }, - { - "__type__": "shuffle", - "page_size": 9223372036854775807 - }, { "__type__": "rename_fields", "field_to_field": { diff --git a/src/unitxt/catalog/cards/billsum_document_filtered_to_10000_chars.json b/src/unitxt/catalog/cards/billsum_document_filtered_to_10000_chars.json index 0e6328cc6..bc8f347c6 100644 --- a/src/unitxt/catalog/cards/billsum_document_filtered_to_10000_chars.json +++ b/src/unitxt/catalog/cards/billsum_document_filtered_to_10000_chars.json @@ -13,10 +13,6 @@ "test": "test" } }, - { - "__type__": "shuffle", - "page_size": 9223372036854775807 - }, { "__type__": 
"rename_fields", "field_to_field": { diff --git a/src/unitxt/catalog/cards/billsum_document_filtered_to_6000_chars.json b/src/unitxt/catalog/cards/billsum_document_filtered_to_6000_chars.json index 52cce293c..042f7cdde 100644 --- a/src/unitxt/catalog/cards/billsum_document_filtered_to_6000_chars.json +++ b/src/unitxt/catalog/cards/billsum_document_filtered_to_6000_chars.json @@ -13,10 +13,6 @@ "test": "test" } }, - { - "__type__": "shuffle", - "page_size": 9223372036854775807 - }, { "__type__": "rename_fields", "field_to_field": { diff --git a/src/unitxt/catalog/cards/coedit/preference.json b/src/unitxt/catalog/cards/coedit/preference.json index 9f2097c54..894019ebb 100644 --- a/src/unitxt/catalog/cards/coedit/preference.json +++ b/src/unitxt/catalog/cards/coedit/preference.json @@ -52,10 +52,14 @@ }, { "__type__": "rename_fields", - "field_to_field": { - "src": "input", - "tgt": "output_choice" - } + "field": "src", + "to_field": "input" + }, + { + "__type__": "index_of", + "search_in": "choices", + "index_of": "tgt", + "to_field": "output_choice" } ], "task": "tasks.evaluation.preference", diff --git a/src/unitxt/catalog/cards/dynamic_cards_for_llm_judges/pairwise_comparative_rating/single_turn.json b/src/unitxt/catalog/cards/dynamic_cards_for_llm_judges/pairwise_comparative_rating/single_turn.json new file mode 100644 index 000000000..216ae1f38 --- /dev/null +++ b/src/unitxt/catalog/cards/dynamic_cards_for_llm_judges/pairwise_comparative_rating/single_turn.json @@ -0,0 +1,6 @@ +{ + "__type__": "task_card", + "loader": null, + "preprocess_steps": [], + "task": "tasks.response_assessment.pairwise_comparative_rating.single_turn" +} diff --git a/src/unitxt/catalog/cards/fin_qa.json b/src/unitxt/catalog/cards/fin_qa.json index 4bbe989d4..72e2765e1 100644 --- a/src/unitxt/catalog/cards/fin_qa.json +++ b/src/unitxt/catalog/cards/fin_qa.json @@ -6,6 +6,10 @@ "streaming": false }, "preprocess_steps": [ + { + "__type__": "filter_by_expression", + "expression": "len(table) > 1" + }, { "__type__": "copy_fields", "field_to_field": [ diff --git a/src/unitxt/catalog/cards/hh_rlhf.json b/src/unitxt/catalog/cards/hh_rlhf.json index 50f88d39a..41fb6dbfb 100644 --- a/src/unitxt/catalog/cards/hh_rlhf.json +++ b/src/unitxt/catalog/cards/hh_rlhf.json @@ -68,6 +68,12 @@ "output_type": "response", "instruction": "Respond the following dialog in an helpful and harmfull way." 
} + }, + { + "__type__": "index_of", + "search_in": "choices", + "index_of": "output_choice", + "to_field": "output_choice" } ], "task": "tasks.evaluation.preference", diff --git a/src/unitxt/catalog/cards/mt/flores_101/ara_eng.json b/src/unitxt/catalog/cards/mt/flores_101/ara_eng.json new file mode 100644 index 000000000..2a404b4d0 --- /dev/null +++ b/src/unitxt/catalog/cards/mt/flores_101/ara_eng.json @@ -0,0 +1,33 @@ +{ + "__type__": "task_card", + "loader": { + "__type__": "load_hf", + "path": "gsarti/flores_101", + "name": "all" + }, + "preprocess_steps": [ + { + "__type__": "split_random_mix", + "mix": { + "validation": "dev", + "test": "devtest" + } + }, + { + "__type__": "copy", + "field_to_field": { + "sentence_ara": "text", + "sentence_eng": "translation" + } + }, + { + "__type__": "set", + "fields": { + "source_language": "arabic", + "target_language": "english" + } + } + ], + "task": "tasks.translation.directed", + "templates": "templates.translation.directed.all" +} diff --git a/src/unitxt/catalog/cards/mt/flores_101/deu_eng.json b/src/unitxt/catalog/cards/mt/flores_101/deu_eng.json new file mode 100644 index 000000000..e476f0fdc --- /dev/null +++ b/src/unitxt/catalog/cards/mt/flores_101/deu_eng.json @@ -0,0 +1,33 @@ +{ + "__type__": "task_card", + "loader": { + "__type__": "load_hf", + "path": "gsarti/flores_101", + "name": "all" + }, + "preprocess_steps": [ + { + "__type__": "split_random_mix", + "mix": { + "validation": "dev", + "test": "devtest" + } + }, + { + "__type__": "copy", + "field_to_field": { + "sentence_deu": "text", + "sentence_eng": "translation" + } + }, + { + "__type__": "set", + "fields": { + "source_language": "german", + "target_language": "english" + } + } + ], + "task": "tasks.translation.directed", + "templates": "templates.translation.directed.all" +} diff --git a/src/unitxt/catalog/cards/mt/flores_101/eng_ara.json b/src/unitxt/catalog/cards/mt/flores_101/eng_ara.json new file mode 100644 index 000000000..8d75655f3 --- /dev/null +++ b/src/unitxt/catalog/cards/mt/flores_101/eng_ara.json @@ -0,0 +1,33 @@ +{ + "__type__": "task_card", + "loader": { + "__type__": "load_hf", + "path": "gsarti/flores_101", + "name": "all" + }, + "preprocess_steps": [ + { + "__type__": "split_random_mix", + "mix": { + "validation": "dev", + "test": "devtest" + } + }, + { + "__type__": "copy", + "field_to_field": { + "sentence_eng": "text", + "sentence_ara": "translation" + } + }, + { + "__type__": "set", + "fields": { + "source_language": "english", + "target_language": "arabic" + } + } + ], + "task": "tasks.translation.directed", + "templates": "templates.translation.directed.all" +} diff --git a/src/unitxt/catalog/cards/mt/flores_101/eng_deu.json b/src/unitxt/catalog/cards/mt/flores_101/eng_deu.json new file mode 100644 index 000000000..e6c6c5754 --- /dev/null +++ b/src/unitxt/catalog/cards/mt/flores_101/eng_deu.json @@ -0,0 +1,33 @@ +{ + "__type__": "task_card", + "loader": { + "__type__": "load_hf", + "path": "gsarti/flores_101", + "name": "all" + }, + "preprocess_steps": [ + { + "__type__": "split_random_mix", + "mix": { + "validation": "dev", + "test": "devtest" + } + }, + { + "__type__": "copy", + "field_to_field": { + "sentence_eng": "text", + "sentence_deu": "translation" + } + }, + { + "__type__": "set", + "fields": { + "source_language": "english", + "target_language": "german" + } + } + ], + "task": "tasks.translation.directed", + "templates": "templates.translation.directed.all" +} diff --git a/src/unitxt/catalog/cards/mt/flores_101/eng_fra.json 
b/src/unitxt/catalog/cards/mt/flores_101/eng_fra.json new file mode 100644 index 000000000..db0b7fe7b --- /dev/null +++ b/src/unitxt/catalog/cards/mt/flores_101/eng_fra.json @@ -0,0 +1,33 @@ +{ + "__type__": "task_card", + "loader": { + "__type__": "load_hf", + "path": "gsarti/flores_101", + "name": "all" + }, + "preprocess_steps": [ + { + "__type__": "split_random_mix", + "mix": { + "validation": "dev", + "test": "devtest" + } + }, + { + "__type__": "copy", + "field_to_field": { + "sentence_eng": "text", + "sentence_fra": "translation" + } + }, + { + "__type__": "set", + "fields": { + "source_language": "english", + "target_language": "french" + } + } + ], + "task": "tasks.translation.directed", + "templates": "templates.translation.directed.all" +} diff --git a/src/unitxt/catalog/cards/mt/flores_101/eng_jpn.json b/src/unitxt/catalog/cards/mt/flores_101/eng_jpn.json new file mode 100644 index 000000000..75e4c5630 --- /dev/null +++ b/src/unitxt/catalog/cards/mt/flores_101/eng_jpn.json @@ -0,0 +1,33 @@ +{ + "__type__": "task_card", + "loader": { + "__type__": "load_hf", + "path": "gsarti/flores_101", + "name": "all" + }, + "preprocess_steps": [ + { + "__type__": "split_random_mix", + "mix": { + "validation": "dev", + "test": "devtest" + } + }, + { + "__type__": "copy", + "field_to_field": { + "sentence_eng": "text", + "sentence_jpn": "translation" + } + }, + { + "__type__": "set", + "fields": { + "source_language": "english", + "target_language": "japanese" + } + } + ], + "task": "tasks.translation.directed", + "templates": "templates.translation.directed.all" +} diff --git a/src/unitxt/catalog/cards/mt/flores_101/eng_kor.json b/src/unitxt/catalog/cards/mt/flores_101/eng_kor.json new file mode 100644 index 000000000..b54eb6d41 --- /dev/null +++ b/src/unitxt/catalog/cards/mt/flores_101/eng_kor.json @@ -0,0 +1,33 @@ +{ + "__type__": "task_card", + "loader": { + "__type__": "load_hf", + "path": "gsarti/flores_101", + "name": "all" + }, + "preprocess_steps": [ + { + "__type__": "split_random_mix", + "mix": { + "validation": "dev", + "test": "devtest" + } + }, + { + "__type__": "copy", + "field_to_field": { + "sentence_eng": "text", + "sentence_kor": "translation" + } + }, + { + "__type__": "set", + "fields": { + "source_language": "english", + "target_language": "korean" + } + } + ], + "task": "tasks.translation.directed", + "templates": "templates.translation.directed.all" +} diff --git a/src/unitxt/catalog/cards/mt/flores_101/eng_por.json b/src/unitxt/catalog/cards/mt/flores_101/eng_por.json new file mode 100644 index 000000000..1a87655a9 --- /dev/null +++ b/src/unitxt/catalog/cards/mt/flores_101/eng_por.json @@ -0,0 +1,33 @@ +{ + "__type__": "task_card", + "loader": { + "__type__": "load_hf", + "path": "gsarti/flores_101", + "name": "all" + }, + "preprocess_steps": [ + { + "__type__": "split_random_mix", + "mix": { + "validation": "dev", + "test": "devtest" + } + }, + { + "__type__": "copy", + "field_to_field": { + "sentence_eng": "text", + "sentence_por": "translation" + } + }, + { + "__type__": "set", + "fields": { + "source_language": "english", + "target_language": "portuguese" + } + } + ], + "task": "tasks.translation.directed", + "templates": "templates.translation.directed.all" +} diff --git a/src/unitxt/catalog/cards/mt/flores_101/eng_ron.json b/src/unitxt/catalog/cards/mt/flores_101/eng_ron.json new file mode 100644 index 000000000..242d144d9 --- /dev/null +++ b/src/unitxt/catalog/cards/mt/flores_101/eng_ron.json @@ -0,0 +1,33 @@ +{ + "__type__": "task_card", + "loader": { + 
"__type__": "load_hf", + "path": "gsarti/flores_101", + "name": "all" + }, + "preprocess_steps": [ + { + "__type__": "split_random_mix", + "mix": { + "validation": "dev", + "test": "devtest" + } + }, + { + "__type__": "copy", + "field_to_field": { + "sentence_eng": "text", + "sentence_ron": "translation" + } + }, + { + "__type__": "set", + "fields": { + "source_language": "english", + "target_language": "romanian" + } + } + ], + "task": "tasks.translation.directed", + "templates": "templates.translation.directed.all" +} diff --git a/src/unitxt/catalog/cards/mt/flores_101/eng_spa.json b/src/unitxt/catalog/cards/mt/flores_101/eng_spa.json new file mode 100644 index 000000000..04f294969 --- /dev/null +++ b/src/unitxt/catalog/cards/mt/flores_101/eng_spa.json @@ -0,0 +1,33 @@ +{ + "__type__": "task_card", + "loader": { + "__type__": "load_hf", + "path": "gsarti/flores_101", + "name": "all" + }, + "preprocess_steps": [ + { + "__type__": "split_random_mix", + "mix": { + "validation": "dev", + "test": "devtest" + } + }, + { + "__type__": "copy", + "field_to_field": { + "sentence_eng": "text", + "sentence_spa": "translation" + } + }, + { + "__type__": "set", + "fields": { + "source_language": "english", + "target_language": "spanish" + } + } + ], + "task": "tasks.translation.directed", + "templates": "templates.translation.directed.all" +} diff --git a/src/unitxt/catalog/cards/mt/flores_101/fra_eng.json b/src/unitxt/catalog/cards/mt/flores_101/fra_eng.json new file mode 100644 index 000000000..6c8a4ba4c --- /dev/null +++ b/src/unitxt/catalog/cards/mt/flores_101/fra_eng.json @@ -0,0 +1,33 @@ +{ + "__type__": "task_card", + "loader": { + "__type__": "load_hf", + "path": "gsarti/flores_101", + "name": "all" + }, + "preprocess_steps": [ + { + "__type__": "split_random_mix", + "mix": { + "validation": "dev", + "test": "devtest" + } + }, + { + "__type__": "copy", + "field_to_field": { + "sentence_fra": "text", + "sentence_eng": "translation" + } + }, + { + "__type__": "set", + "fields": { + "source_language": "french", + "target_language": "english" + } + } + ], + "task": "tasks.translation.directed", + "templates": "templates.translation.directed.all" +} diff --git a/src/unitxt/catalog/cards/mt/flores_101/jpn_eng.json b/src/unitxt/catalog/cards/mt/flores_101/jpn_eng.json new file mode 100644 index 000000000..a29b1b72a --- /dev/null +++ b/src/unitxt/catalog/cards/mt/flores_101/jpn_eng.json @@ -0,0 +1,33 @@ +{ + "__type__": "task_card", + "loader": { + "__type__": "load_hf", + "path": "gsarti/flores_101", + "name": "all" + }, + "preprocess_steps": [ + { + "__type__": "split_random_mix", + "mix": { + "validation": "dev", + "test": "devtest" + } + }, + { + "__type__": "copy", + "field_to_field": { + "sentence_jpn": "text", + "sentence_eng": "translation" + } + }, + { + "__type__": "set", + "fields": { + "source_language": "japanese", + "target_language": "english" + } + } + ], + "task": "tasks.translation.directed", + "templates": "templates.translation.directed.all" +} diff --git a/src/unitxt/catalog/cards/mt/flores_101/kor_eng.json b/src/unitxt/catalog/cards/mt/flores_101/kor_eng.json new file mode 100644 index 000000000..b75499bf8 --- /dev/null +++ b/src/unitxt/catalog/cards/mt/flores_101/kor_eng.json @@ -0,0 +1,33 @@ +{ + "__type__": "task_card", + "loader": { + "__type__": "load_hf", + "path": "gsarti/flores_101", + "name": "all" + }, + "preprocess_steps": [ + { + "__type__": "split_random_mix", + "mix": { + "validation": "dev", + "test": "devtest" + } + }, + { + "__type__": "copy", + "field_to_field": 
{ + "sentence_kor": "text", + "sentence_eng": "translation" + } + }, + { + "__type__": "set", + "fields": { + "source_language": "korean", + "target_language": "english" + } + } + ], + "task": "tasks.translation.directed", + "templates": "templates.translation.directed.all" +} diff --git a/src/unitxt/catalog/cards/mt/flores_101/por_eng.json b/src/unitxt/catalog/cards/mt/flores_101/por_eng.json new file mode 100644 index 000000000..12e713963 --- /dev/null +++ b/src/unitxt/catalog/cards/mt/flores_101/por_eng.json @@ -0,0 +1,33 @@ +{ + "__type__": "task_card", + "loader": { + "__type__": "load_hf", + "path": "gsarti/flores_101", + "name": "all" + }, + "preprocess_steps": [ + { + "__type__": "split_random_mix", + "mix": { + "validation": "dev", + "test": "devtest" + } + }, + { + "__type__": "copy", + "field_to_field": { + "sentence_por": "text", + "sentence_eng": "translation" + } + }, + { + "__type__": "set", + "fields": { + "source_language": "portuguese", + "target_language": "english" + } + } + ], + "task": "tasks.translation.directed", + "templates": "templates.translation.directed.all" +} diff --git a/src/unitxt/catalog/cards/mt/flores_101/ron_eng.json b/src/unitxt/catalog/cards/mt/flores_101/ron_eng.json new file mode 100644 index 000000000..a078aa11d --- /dev/null +++ b/src/unitxt/catalog/cards/mt/flores_101/ron_eng.json @@ -0,0 +1,33 @@ +{ + "__type__": "task_card", + "loader": { + "__type__": "load_hf", + "path": "gsarti/flores_101", + "name": "all" + }, + "preprocess_steps": [ + { + "__type__": "split_random_mix", + "mix": { + "validation": "dev", + "test": "devtest" + } + }, + { + "__type__": "copy", + "field_to_field": { + "sentence_ron": "text", + "sentence_eng": "translation" + } + }, + { + "__type__": "set", + "fields": { + "source_language": "romanian", + "target_language": "english" + } + } + ], + "task": "tasks.translation.directed", + "templates": "templates.translation.directed.all" +} diff --git a/src/unitxt/catalog/cards/mt/flores_101/spa_eng.json b/src/unitxt/catalog/cards/mt/flores_101/spa_eng.json new file mode 100644 index 000000000..56a57a17d --- /dev/null +++ b/src/unitxt/catalog/cards/mt/flores_101/spa_eng.json @@ -0,0 +1,33 @@ +{ + "__type__": "task_card", + "loader": { + "__type__": "load_hf", + "path": "gsarti/flores_101", + "name": "all" + }, + "preprocess_steps": [ + { + "__type__": "split_random_mix", + "mix": { + "validation": "dev", + "test": "devtest" + } + }, + { + "__type__": "copy", + "field_to_field": { + "sentence_spa": "text", + "sentence_eng": "translation" + } + }, + { + "__type__": "set", + "fields": { + "source_language": "spanish", + "target_language": "english" + } + } + ], + "task": "tasks.translation.directed", + "templates": "templates.translation.directed.all" +} diff --git a/src/unitxt/catalog/cards/mt_bench/response_assessment/pairwise_comparison/multi_turn_gpt4_judgement.json b/src/unitxt/catalog/cards/mt_bench/response_assessment/pairwise_comparison/multi_turn_gpt4_judgement.json index 47073a5a8..9f5f8b74c 100644 --- a/src/unitxt/catalog/cards/mt_bench/response_assessment/pairwise_comparison/multi_turn_gpt4_judgement.json +++ b/src/unitxt/catalog/cards/mt_bench/response_assessment/pairwise_comparison/multi_turn_gpt4_judgement.json @@ -68,6 +68,6 @@ ], "task": "tasks.response_assessment.pairwise_comparison.multi_turn", "templates": [ - "templates.response_assessment.pairwise_comparison.mt_bench_multi_turn_with_shuffle" + "templates.response_assessment.pairwise_comparison.mt_bench_multi_turn_with_shuffling" ] } diff --git 
a/src/unitxt/catalog/cards/mt_bench/response_assessment/pairwise_comparison/multi_turn_with_reference_gpt4_judgement.json b/src/unitxt/catalog/cards/mt_bench/response_assessment/pairwise_comparison/multi_turn_with_reference_gpt4_judgement.json index 7d4cae4b9..4770c41d7 100644 --- a/src/unitxt/catalog/cards/mt_bench/response_assessment/pairwise_comparison/multi_turn_with_reference_gpt4_judgement.json +++ b/src/unitxt/catalog/cards/mt_bench/response_assessment/pairwise_comparison/multi_turn_with_reference_gpt4_judgement.json @@ -74,6 +74,6 @@ ], "task": "tasks.response_assessment.pairwise_comparison.multi_turn_with_reference", "templates": [ - "templates.response_assessment.pairwise_comparison.mt_bench_multi_turn_with_reference_with_shuffle" + "templates.response_assessment.pairwise_comparison.mt_bench_multi_turn_with_reference_with_shuffling" ] } diff --git a/src/unitxt/catalog/cards/mt_bench/response_assessment/pairwise_comparison/single_turn_gpt4_judgement.json b/src/unitxt/catalog/cards/mt_bench/response_assessment/pairwise_comparison/single_turn_gpt4_judgement.json index e401769ad..1ef43960d 100644 --- a/src/unitxt/catalog/cards/mt_bench/response_assessment/pairwise_comparison/single_turn_gpt4_judgement.json +++ b/src/unitxt/catalog/cards/mt_bench/response_assessment/pairwise_comparison/single_turn_gpt4_judgement.json @@ -74,6 +74,6 @@ ], "task": "tasks.response_assessment.pairwise_comparison.single_turn", "templates": [ - "templates.response_assessment.pairwise_comparison.mt_bench_single_turn_with_shuffle" + "templates.response_assessment.pairwise_comparison.mt_bench_single_turn_with_shuffling" ] } diff --git a/src/unitxt/catalog/cards/mt_bench/response_assessment/pairwise_comparison/single_turn_with_reference_gpt4_judgement.json b/src/unitxt/catalog/cards/mt_bench/response_assessment/pairwise_comparison/single_turn_with_reference_gpt4_judgement.json index d1209e6de..7b89e0d7f 100644 --- a/src/unitxt/catalog/cards/mt_bench/response_assessment/pairwise_comparison/single_turn_with_reference_gpt4_judgement.json +++ b/src/unitxt/catalog/cards/mt_bench/response_assessment/pairwise_comparison/single_turn_with_reference_gpt4_judgement.json @@ -80,6 +80,6 @@ ], "task": "tasks.response_assessment.pairwise_comparison.single_turn_with_reference", "templates": [ - "templates.response_assessment.pairwise_comparison.mt_bench_single_turn_with_reference_with_shuffle" + "templates.response_assessment.pairwise_comparison.mt_bench_single_turn_with_reference_with_shuffling" ] } diff --git a/src/unitxt/catalog/cards/numeric_nlg.json b/src/unitxt/catalog/cards/numeric_nlg.json index 0dde571da..4e0d661bc 100644 --- a/src/unitxt/catalog/cards/numeric_nlg.json +++ b/src/unitxt/catalog/cards/numeric_nlg.json @@ -8,7 +8,8 @@ { "__type__": "set", "fields": { - "type_of_input": "table", + "type_of_input_a": "table", + "type_of_input_b": "caption", "type_of_output": "description" } }, @@ -20,16 +21,21 @@ { "__type__": "serialize_table_as_markdown", "field": "table_out", - "to_field": "input" + "to_field": "input_a" }, { "__type__": "rename_fields", "field": "description", "to_field": "output" + }, + { + "__type__": "rename_fields", + "field": "caption", + "to_field": "input_b" } ], - "task": "tasks.generation[metrics=[metrics.bleu,metrics.rouge,metrics.bert_score.bert_base_uncased,metrics.meteor]]", - "templates": "templates.generation.all", + "task": "tasks.generation.from_pair", + "templates": "templates.generation.from_pair.all", "__description__": "NumericNLG is a dataset for numerical table-to-text generation 
using pairs of a table and a paragraph of a table description with richer inference from scientific papers.", "__tags__": { "modality": "table", diff --git a/src/unitxt/catalog/cards/rag/benchmark/clap_nq/en.json b/src/unitxt/catalog/cards/rag/benchmark/clap_nq/en.json new file mode 100644 index 000000000..3f51389f8 --- /dev/null +++ b/src/unitxt/catalog/cards/rag/benchmark/clap_nq/en.json @@ -0,0 +1,46 @@ +{ + "__type__": "task_card", + "loader": { + "__type__": "load_csv", + "sep": "\t", + "files": { + "train": "https://raw.githubusercontent.com/primeqa/clapnq/main/retrieval/train/question_train_answerable.tsv", + "test": "https://raw.githubusercontent.com/primeqa/clapnq/main/retrieval/dev/question_dev_answerable.tsv" + } + }, + "preprocess_steps": [ + { + "__type__": "copy", + "field_to_field": { + "question": "question", + "id": "question_id" + } + }, + { + "__type__": "set", + "fields": { + "reference_contexts": [], + "is_answerable_label": true, + "metadata_field": "" + } + }, + { + "__type__": "list_field_values", + "fields": [ + "doc-id-list" + ], + "to_field": "reference_context_ids" + }, + { + "__type__": "list_field_values", + "fields": [ + "answers" + ], + "to_field": "reference_answers" + } + ], + "task": "tasks.rag.end_to_end", + "templates": { + "default": "templates.rag.end_to_end.json_predictions" + } +} diff --git a/src/unitxt/catalog/cards/rag/documents/clap_nq/en.json b/src/unitxt/catalog/cards/rag/documents/clap_nq/en.json new file mode 100644 index 000000000..0176078d7 --- /dev/null +++ b/src/unitxt/catalog/cards/rag/documents/clap_nq/en.json @@ -0,0 +1,40 @@ +{ + "__type__": "task_card", + "loader": { + "__type__": "load_csv", + "sep": "\t", + "files": { + "train": "https://media.githubusercontent.com/media/primeqa/clapnq/main/retrieval/passages.tsv" + } + }, + "preprocess_steps": [ + { + "__type__": "copy", + "field_to_field": { + "id": "document_id", + "title": "title" + } + }, + { + "__type__": "list_field_values", + "fields": [ + "text" + ], + "to_field": "passages" + }, + { + "__type__": "set", + "fields": { + "metadata_field": "" + } + } + ], + "task": "tasks.rag.corpora", + "templates": { + "empty": { + "__type__": "input_output_template", + "input_format": "", + "output_format": "" + } + } +} diff --git a/src/unitxt/catalog/cards/rag/response_generation/clapnq.json b/src/unitxt/catalog/cards/rag/response_generation/clapnq.json index 7f641f52e..4a4ce37ec 100644 --- a/src/unitxt/catalog/cards/rag/response_generation/clapnq.json +++ b/src/unitxt/catalog/cards/rag/response_generation/clapnq.json @@ -25,6 +25,26 @@ "fields": { "contexts_ids": [] } + }, + { + "__type__": "map_instance_values", + "mappers": { + "reference_answers": { + "['']": [ + "I'm sorry, I cannot answer this question based on the context.", + "The answer is not in the text provided.", + "Unanswerable.", + "The provided context does not contain the information needed to answer this question.", + "There is not enough information in the text to answer this question.", + "The text does not provide an answer to this question.", + "Based on the context, an answer cannot be determined.", + "The answer to this question is not available in the provided context.", + "This question cannot be answered with the given information.", + "Insufficient context to provide an answer." 
+ ] + } + }, + "strict": false } ], "task": "tasks.rag.response_generation", diff --git a/src/unitxt/catalog/cards/reward_bench/chat.json b/src/unitxt/catalog/cards/reward_bench/chat.json new file mode 100644 index 000000000..45c46b9ac --- /dev/null +++ b/src/unitxt/catalog/cards/reward_bench/chat.json @@ -0,0 +1,48 @@ +{ + "__type__": "task_card", + "loader": { + "__type__": "load_hf", + "path": "allenai/reward-bench", + "split": "filtered" + }, + "preprocess_steps": [ + { + "__type__": "rename_splits", + "mapper": { + "filtered": "test" + } + }, + { + "__type__": "rename_fields", + "field_to_field": { + "prompt": "question", + "chosen": "answer_a", + "rejected": "answer_b", + "subset": "group" + } + }, + { + "__type__": "set", + "fields": { + "winner": "choice_a" + } + }, + { + "__type__": "filter_by_condition", + "values": { + "group": [ + "alpacaeval-easy", + "alpacaeval-length", + "alpacaeval-hard", + "mt-bench-easy", + "mt-bench-med" + ] + }, + "condition": "in" + } + ], + "task": "tasks.response_assessment.pairwise_comparison.single_turn", + "templates": [ + "templates.response_assessment.pairwise_comparison.mt_bench_single_turn" + ] +} diff --git a/src/unitxt/catalog/cards/reward_bench/chat_hard.json b/src/unitxt/catalog/cards/reward_bench/chat_hard.json new file mode 100644 index 000000000..bfebb8793 --- /dev/null +++ b/src/unitxt/catalog/cards/reward_bench/chat_hard.json @@ -0,0 +1,49 @@ +{ + "__type__": "task_card", + "loader": { + "__type__": "load_hf", + "path": "allenai/reward-bench", + "split": "filtered" + }, + "preprocess_steps": [ + { + "__type__": "rename_splits", + "mapper": { + "filtered": "test" + } + }, + { + "__type__": "rename_fields", + "field_to_field": { + "prompt": "question", + "chosen": "answer_a", + "rejected": "answer_b", + "subset": "group" + } + }, + { + "__type__": "set", + "fields": { + "winner": "choice_a" + } + }, + { + "__type__": "filter_by_condition", + "values": { + "group": [ + "mt-bench-hard", + "llmbar-natural", + "llmbar-adver-neighbor", + "llmbar-adver-GPTInst", + "llmbar-adver-GPTOut", + "llmbar-adver-manual" + ] + }, + "condition": "in" + } + ], + "task": "tasks.response_assessment.pairwise_comparison.single_turn", + "templates": [ + "templates.response_assessment.pairwise_comparison.mt_bench_single_turn" + ] +} diff --git a/src/unitxt/catalog/cards/reward_bench/reasoning.json b/src/unitxt/catalog/cards/reward_bench/reasoning.json new file mode 100644 index 000000000..69976c0e3 --- /dev/null +++ b/src/unitxt/catalog/cards/reward_bench/reasoning.json @@ -0,0 +1,50 @@ +{ + "__type__": "task_card", + "loader": { + "__type__": "load_hf", + "path": "allenai/reward-bench", + "split": "filtered" + }, + "preprocess_steps": [ + { + "__type__": "rename_splits", + "mapper": { + "filtered": "test" + } + }, + { + "__type__": "rename_fields", + "field_to_field": { + "prompt": "question", + "chosen": "answer_a", + "rejected": "answer_b", + "subset": "group" + } + }, + { + "__type__": "set", + "fields": { + "winner": "choice_a" + } + }, + { + "__type__": "filter_by_condition", + "values": { + "group": [ + "math-prm", + "hep-cpp", + "hep-go", + "hep-java", + "hep-js", + "hep-python", + "hep-rust" + ] + }, + "condition": "in" + } + ], + "task": "tasks.response_assessment.pairwise_comparison.single_turn", + "templates": [ + "templates.response_assessment.pairwise_comparison.mt_bench_single_turn" + ] +} diff --git a/src/unitxt/catalog/cards/reward_bench/safety.json b/src/unitxt/catalog/cards/reward_bench/safety.json new file mode 100644 index 
000000000..d983b482e --- /dev/null +++ b/src/unitxt/catalog/cards/reward_bench/safety.json @@ -0,0 +1,48 @@ +{ + "__type__": "task_card", + "loader": { + "__type__": "load_hf", + "path": "allenai/reward-bench", + "split": "filtered" + }, + "preprocess_steps": [ + { + "__type__": "rename_splits", + "mapper": { + "filtered": "test" + } + }, + { + "__type__": "rename_fields", + "field_to_field": { + "prompt": "question", + "chosen": "answer_a", + "rejected": "answer_b", + "subset": "group" + } + }, + { + "__type__": "set", + "fields": { + "winner": "choice_a" + } + }, + { + "__type__": "filter_by_condition", + "values": { + "group": [ + "refusals-dangerous", + "refusals-offensive", + "xstest-should-refuse", + "xstest-should-respond", + "donotanswer" + ] + }, + "condition": "in" + } + ], + "task": "tasks.response_assessment.pairwise_comparison.single_turn", + "templates": [ + "templates.response_assessment.pairwise_comparison.mt_bench_single_turn" + ] +} diff --git a/src/unitxt/catalog/cards/tldr.json b/src/unitxt/catalog/cards/tldr.json index 54d259c8c..9832cfbaa 100644 --- a/src/unitxt/catalog/cards/tldr.json +++ b/src/unitxt/catalog/cards/tldr.json @@ -9,8 +9,9 @@ { "__type__": "split_random_mix", "mix": { - "train": "train[50%]", - "test": "train[50%]" + "train": "train[70%]", + "validation": "train[15%]", + "test": "train[15%]" } }, { diff --git a/src/unitxt/catalog/cards/tldr_document_filtered_to_10000_chars.json b/src/unitxt/catalog/cards/tldr_document_filtered_to_10000_chars.json index 60f6de54a..e58be01fe 100644 --- a/src/unitxt/catalog/cards/tldr_document_filtered_to_10000_chars.json +++ b/src/unitxt/catalog/cards/tldr_document_filtered_to_10000_chars.json @@ -9,8 +9,9 @@ { "__type__": "split_random_mix", "mix": { - "train": "train[50%]", - "test": "train[50%]" + "train": "train[70%]", + "validation": "train[15%]", + "test": "train[15%]" } }, { diff --git a/src/unitxt/catalog/cards/tldr_document_filtered_to_6000_chars.json b/src/unitxt/catalog/cards/tldr_document_filtered_to_6000_chars.json index caf08f8aa..6bba48b2a 100644 --- a/src/unitxt/catalog/cards/tldr_document_filtered_to_6000_chars.json +++ b/src/unitxt/catalog/cards/tldr_document_filtered_to_6000_chars.json @@ -9,8 +9,9 @@ { "__type__": "split_random_mix", "mix": { - "train": "train[50%]", - "test": "train[50%]" + "train": "train[70%]", + "validation": "train[15%]", + "test": "train[15%]" } }, { diff --git a/src/unitxt/catalog/cards/universal_ner/ceb/gja.json b/src/unitxt/catalog/cards/universal_ner/ceb/gja.json index e67fbe1bd..b65c5f105 100644 --- a/src/unitxt/catalog/cards/universal_ner/ceb/gja.json +++ b/src/unitxt/catalog/cards/universal_ner/ceb/gja.json @@ -9,6 +9,10 @@ ] }, "preprocess_steps": [ + { + "__type__": "shuffle", + "page_size": 9223372036854775807 + }, { "__type__": "rename_fields", "field_to_field": { diff --git a/src/unitxt/catalog/cards/universal_ner/da/ddt.json b/src/unitxt/catalog/cards/universal_ner/da/ddt.json index 8d55a6870..53f0e36c7 100644 --- a/src/unitxt/catalog/cards/universal_ner/da/ddt.json +++ b/src/unitxt/catalog/cards/universal_ner/da/ddt.json @@ -9,6 +9,10 @@ ] }, "preprocess_steps": [ + { + "__type__": "shuffle", + "page_size": 9223372036854775807 + }, { "__type__": "rename_fields", "field_to_field": { diff --git a/src/unitxt/catalog/cards/universal_ner/de/pud.json b/src/unitxt/catalog/cards/universal_ner/de/pud.json index 5caebc44f..e3378f391 100644 --- a/src/unitxt/catalog/cards/universal_ner/de/pud.json +++ b/src/unitxt/catalog/cards/universal_ner/de/pud.json @@ -9,6 +9,10 @@ ] }, 
"preprocess_steps": [ + { + "__type__": "shuffle", + "page_size": 9223372036854775807 + }, { "__type__": "rename_fields", "field_to_field": { diff --git a/src/unitxt/catalog/cards/universal_ner/en/ewt.json b/src/unitxt/catalog/cards/universal_ner/en/ewt.json index 2ef63c9d9..17b81982e 100644 --- a/src/unitxt/catalog/cards/universal_ner/en/ewt.json +++ b/src/unitxt/catalog/cards/universal_ner/en/ewt.json @@ -9,6 +9,10 @@ ] }, "preprocess_steps": [ + { + "__type__": "shuffle", + "page_size": 9223372036854775807 + }, { "__type__": "rename_fields", "field_to_field": { diff --git a/src/unitxt/catalog/cards/universal_ner/en/pud.json b/src/unitxt/catalog/cards/universal_ner/en/pud.json index 37ec59245..be35437ba 100644 --- a/src/unitxt/catalog/cards/universal_ner/en/pud.json +++ b/src/unitxt/catalog/cards/universal_ner/en/pud.json @@ -9,6 +9,10 @@ ] }, "preprocess_steps": [ + { + "__type__": "shuffle", + "page_size": 9223372036854775807 + }, { "__type__": "rename_fields", "field_to_field": { diff --git a/src/unitxt/catalog/cards/universal_ner/hr/set.json b/src/unitxt/catalog/cards/universal_ner/hr/set.json index 3f54d2152..753b357dd 100644 --- a/src/unitxt/catalog/cards/universal_ner/hr/set.json +++ b/src/unitxt/catalog/cards/universal_ner/hr/set.json @@ -9,6 +9,10 @@ ] }, "preprocess_steps": [ + { + "__type__": "shuffle", + "page_size": 9223372036854775807 + }, { "__type__": "rename_fields", "field_to_field": { diff --git a/src/unitxt/catalog/cards/universal_ner/pt/bosque.json b/src/unitxt/catalog/cards/universal_ner/pt/bosque.json index 2ef67245b..9a7a160cd 100644 --- a/src/unitxt/catalog/cards/universal_ner/pt/bosque.json +++ b/src/unitxt/catalog/cards/universal_ner/pt/bosque.json @@ -9,6 +9,10 @@ ] }, "preprocess_steps": [ + { + "__type__": "shuffle", + "page_size": 9223372036854775807 + }, { "__type__": "rename_fields", "field_to_field": { diff --git a/src/unitxt/catalog/cards/universal_ner/pt/pud.json b/src/unitxt/catalog/cards/universal_ner/pt/pud.json index b0ac0f23b..4e955c591 100644 --- a/src/unitxt/catalog/cards/universal_ner/pt/pud.json +++ b/src/unitxt/catalog/cards/universal_ner/pt/pud.json @@ -9,6 +9,10 @@ ] }, "preprocess_steps": [ + { + "__type__": "shuffle", + "page_size": 9223372036854775807 + }, { "__type__": "rename_fields", "field_to_field": { diff --git a/src/unitxt/catalog/cards/universal_ner/ru/pud.json b/src/unitxt/catalog/cards/universal_ner/ru/pud.json index d796cb200..72861bfc8 100644 --- a/src/unitxt/catalog/cards/universal_ner/ru/pud.json +++ b/src/unitxt/catalog/cards/universal_ner/ru/pud.json @@ -9,6 +9,10 @@ ] }, "preprocess_steps": [ + { + "__type__": "shuffle", + "page_size": 9223372036854775807 + }, { "__type__": "rename_fields", "field_to_field": { diff --git a/src/unitxt/catalog/cards/universal_ner/sk/snk.json b/src/unitxt/catalog/cards/universal_ner/sk/snk.json index 2ef021d5c..b4829d8cc 100644 --- a/src/unitxt/catalog/cards/universal_ner/sk/snk.json +++ b/src/unitxt/catalog/cards/universal_ner/sk/snk.json @@ -9,6 +9,10 @@ ] }, "preprocess_steps": [ + { + "__type__": "shuffle", + "page_size": 9223372036854775807 + }, { "__type__": "rename_fields", "field_to_field": { diff --git a/src/unitxt/catalog/cards/universal_ner/sr/set.json b/src/unitxt/catalog/cards/universal_ner/sr/set.json index 29500f0fe..675910962 100644 --- a/src/unitxt/catalog/cards/universal_ner/sr/set.json +++ b/src/unitxt/catalog/cards/universal_ner/sr/set.json @@ -9,6 +9,10 @@ ] }, "preprocess_steps": [ + { + "__type__": "shuffle", + "page_size": 9223372036854775807 + }, { "__type__": 
"rename_fields", "field_to_field": { diff --git a/src/unitxt/catalog/cards/universal_ner/sv/pud.json b/src/unitxt/catalog/cards/universal_ner/sv/pud.json index a34140d6c..17022be07 100644 --- a/src/unitxt/catalog/cards/universal_ner/sv/pud.json +++ b/src/unitxt/catalog/cards/universal_ner/sv/pud.json @@ -9,6 +9,10 @@ ] }, "preprocess_steps": [ + { + "__type__": "shuffle", + "page_size": 9223372036854775807 + }, { "__type__": "rename_fields", "field_to_field": { diff --git a/src/unitxt/catalog/cards/universal_ner/sv/talbanken.json b/src/unitxt/catalog/cards/universal_ner/sv/talbanken.json index 57c31aab3..ed81d3eaf 100644 --- a/src/unitxt/catalog/cards/universal_ner/sv/talbanken.json +++ b/src/unitxt/catalog/cards/universal_ner/sv/talbanken.json @@ -9,6 +9,10 @@ ] }, "preprocess_steps": [ + { + "__type__": "shuffle", + "page_size": 9223372036854775807 + }, { "__type__": "rename_fields", "field_to_field": { diff --git a/src/unitxt/catalog/cards/universal_ner/tl/trg.json b/src/unitxt/catalog/cards/universal_ner/tl/trg.json index ccc4f413d..fb279459d 100644 --- a/src/unitxt/catalog/cards/universal_ner/tl/trg.json +++ b/src/unitxt/catalog/cards/universal_ner/tl/trg.json @@ -9,6 +9,10 @@ ] }, "preprocess_steps": [ + { + "__type__": "shuffle", + "page_size": 9223372036854775807 + }, { "__type__": "rename_fields", "field_to_field": { diff --git a/src/unitxt/catalog/cards/universal_ner/tl/ugnayan.json b/src/unitxt/catalog/cards/universal_ner/tl/ugnayan.json index 3a7b54b6b..12534ee51 100644 --- a/src/unitxt/catalog/cards/universal_ner/tl/ugnayan.json +++ b/src/unitxt/catalog/cards/universal_ner/tl/ugnayan.json @@ -9,6 +9,10 @@ ] }, "preprocess_steps": [ + { + "__type__": "shuffle", + "page_size": 9223372036854775807 + }, { "__type__": "rename_fields", "field_to_field": { diff --git a/src/unitxt/catalog/cards/universal_ner/zh/gsd.json b/src/unitxt/catalog/cards/universal_ner/zh/gsd.json index 9f1c531ae..ac4a415c0 100644 --- a/src/unitxt/catalog/cards/universal_ner/zh/gsd.json +++ b/src/unitxt/catalog/cards/universal_ner/zh/gsd.json @@ -9,6 +9,10 @@ ] }, "preprocess_steps": [ + { + "__type__": "shuffle", + "page_size": 9223372036854775807 + }, { "__type__": "rename_fields", "field_to_field": { diff --git a/src/unitxt/catalog/cards/universal_ner/zh/gsdsimp.json b/src/unitxt/catalog/cards/universal_ner/zh/gsdsimp.json index e30bad510..aa215325b 100644 --- a/src/unitxt/catalog/cards/universal_ner/zh/gsdsimp.json +++ b/src/unitxt/catalog/cards/universal_ner/zh/gsdsimp.json @@ -9,6 +9,10 @@ ] }, "preprocess_steps": [ + { + "__type__": "shuffle", + "page_size": 9223372036854775807 + }, { "__type__": "rename_fields", "field_to_field": { diff --git a/src/unitxt/catalog/cards/universal_ner/zh/pud.json b/src/unitxt/catalog/cards/universal_ner/zh/pud.json index 30daa799f..0e5aa01f9 100644 --- a/src/unitxt/catalog/cards/universal_ner/zh/pud.json +++ b/src/unitxt/catalog/cards/universal_ner/zh/pud.json @@ -9,6 +9,10 @@ ] }, "preprocess_steps": [ + { + "__type__": "shuffle", + "page_size": 9223372036854775807 + }, { "__type__": "rename_fields", "field_to_field": { diff --git a/src/unitxt/catalog/metrics/llm_as_judge/pairwise_comparative_rating/llama_3_70b_instruct_ibm_genai_template_arena_hard.json b/src/unitxt/catalog/metrics/llm_as_judge/pairwise_comparative_rating/llama_3_70b_instruct_ibm_genai_template_arena_hard.json new file mode 100644 index 000000000..1cad41ef5 --- /dev/null +++ b/src/unitxt/catalog/metrics/llm_as_judge/pairwise_comparative_rating/llama_3_70b_instruct_ibm_genai_template_arena_hard.json @@ 
-0,0 +1,13 @@ +{ + "__type__": "llm_as_judge", + "inference_model": { + "__type__": "ibm_gen_ai_inference_engine", + "model_name": "meta-llama/llama-3-70b-instruct", + "max_new_tokens": 2048, + "random_seed": 42 + }, + "template": "templates.response_assessment.pairwise_comparative_rating.arena_hard", + "task": "pairwise_comparative_rating.single_turn", + "format": "formats.llama3_instruct", + "main_score": "llama_3_70b_instruct_ibm_genai_template_arena_hard" +} diff --git a/src/unitxt/catalog/metrics/llm_as_judge/pairwise_comparative_rating/llama_3_70b_instruct_ibm_genai_template_arena_hard_with_shuffling.json b/src/unitxt/catalog/metrics/llm_as_judge/pairwise_comparative_rating/llama_3_70b_instruct_ibm_genai_template_arena_hard_with_shuffling.json new file mode 100644 index 000000000..fbea383b9 --- /dev/null +++ b/src/unitxt/catalog/metrics/llm_as_judge/pairwise_comparative_rating/llama_3_70b_instruct_ibm_genai_template_arena_hard_with_shuffling.json @@ -0,0 +1,13 @@ +{ + "__type__": "llm_as_judge", + "inference_model": { + "__type__": "ibm_gen_ai_inference_engine", + "model_name": "meta-llama/llama-3-70b-instruct", + "max_new_tokens": 2048, + "random_seed": 42 + }, + "template": "templates.response_assessment.pairwise_comparative_rating.arena_hard_with_shuffling", + "task": "pairwise_comparative_rating.single_turn", + "format": "formats.llama3_instruct", + "main_score": "llama_3_70b_instruct_ibm_genai_template_arena_hard_with_shuffling" +} diff --git a/src/unitxt/catalog/metrics/llm_as_judge/pairwise_comparative_rating/llama_3_8b_instruct_ibm_genai_template_arena_hard.json b/src/unitxt/catalog/metrics/llm_as_judge/pairwise_comparative_rating/llama_3_8b_instruct_ibm_genai_template_arena_hard.json new file mode 100644 index 000000000..b10c5a6fc --- /dev/null +++ b/src/unitxt/catalog/metrics/llm_as_judge/pairwise_comparative_rating/llama_3_8b_instruct_ibm_genai_template_arena_hard.json @@ -0,0 +1,13 @@ +{ + "__type__": "llm_as_judge", + "inference_model": { + "__type__": "ibm_gen_ai_inference_engine", + "model_name": "meta-llama/llama-3-8b-instruct", + "max_new_tokens": 2048, + "random_seed": 42 + }, + "template": "templates.response_assessment.pairwise_comparative_rating.arena_hard", + "task": "pairwise_comparative_rating.single_turn", + "format": "formats.llama3_instruct", + "main_score": "llama_3_8b_instruct_ibm_genai_template_arena_hard" +} diff --git a/src/unitxt/catalog/metrics/llm_as_judge/pairwise_comparative_rating/llama_3_8b_instruct_ibm_genai_template_arena_hard_with_shuffling.json b/src/unitxt/catalog/metrics/llm_as_judge/pairwise_comparative_rating/llama_3_8b_instruct_ibm_genai_template_arena_hard_with_shuffling.json new file mode 100644 index 000000000..25b15fa09 --- /dev/null +++ b/src/unitxt/catalog/metrics/llm_as_judge/pairwise_comparative_rating/llama_3_8b_instruct_ibm_genai_template_arena_hard_with_shuffling.json @@ -0,0 +1,13 @@ +{ + "__type__": "llm_as_judge", + "inference_model": { + "__type__": "ibm_gen_ai_inference_engine", + "model_name": "meta-llama/llama-3-8b-instruct", + "max_new_tokens": 2048, + "random_seed": 42 + }, + "template": "templates.response_assessment.pairwise_comparative_rating.arena_hard_with_shuffling", + "task": "pairwise_comparative_rating.single_turn", + "format": "formats.llama3_instruct", + "main_score": "llama_3_8b_instruct_ibm_genai_template_arena_hard_with_shuffling" +} diff --git a/src/unitxt/catalog/metrics/llm_as_judge/rating/llama_3_70b_instruct_ibm_genai_template_generic_single_turn.json 
b/src/unitxt/catalog/metrics/llm_as_judge/rating/llama_3_70b_instruct_ibm_genai_template_generic_single_turn.json index d5cc8a9c2..f2eb862e5 100644 --- a/src/unitxt/catalog/metrics/llm_as_judge/rating/llama_3_70b_instruct_ibm_genai_template_generic_single_turn.json +++ b/src/unitxt/catalog/metrics/llm_as_judge/rating/llama_3_70b_instruct_ibm_genai_template_generic_single_turn.json @@ -3,7 +3,8 @@ "inference_model": { "__type__": "ibm_gen_ai_inference_engine", "model_name": "meta-llama/llama-3-70b-instruct", - "max_new_tokens": 252 + "max_new_tokens": 252, + "random_seed": 42 }, "template": "templates.response_assessment.rating.generic_single_turn", "task": "rating.single_turn", diff --git a/src/unitxt/catalog/metrics/llm_as_judge/rating/llama_3_70b_instruct_ibm_genai_template_generic_single_turn_with_reference.json b/src/unitxt/catalog/metrics/llm_as_judge/rating/llama_3_70b_instruct_ibm_genai_template_generic_single_turn_with_reference.json index a446726c5..0e53ebc40 100644 --- a/src/unitxt/catalog/metrics/llm_as_judge/rating/llama_3_70b_instruct_ibm_genai_template_generic_single_turn_with_reference.json +++ b/src/unitxt/catalog/metrics/llm_as_judge/rating/llama_3_70b_instruct_ibm_genai_template_generic_single_turn_with_reference.json @@ -3,7 +3,8 @@ "inference_model": { "__type__": "ibm_gen_ai_inference_engine", "model_name": "meta-llama/llama-3-70b-instruct", - "max_new_tokens": 252 + "max_new_tokens": 252, + "random_seed": 42 }, "template": "templates.response_assessment.rating.generic_single_turn_with_reference", "task": "rating.single_turn_with_reference", diff --git a/src/unitxt/catalog/metrics/llm_as_judge/rating/llama_3_70b_instruct_ibm_genai_template_mt_bench_single_turn.json b/src/unitxt/catalog/metrics/llm_as_judge/rating/llama_3_70b_instruct_ibm_genai_template_mt_bench_single_turn.json index 562dc1782..397f4c20e 100644 --- a/src/unitxt/catalog/metrics/llm_as_judge/rating/llama_3_70b_instruct_ibm_genai_template_mt_bench_single_turn.json +++ b/src/unitxt/catalog/metrics/llm_as_judge/rating/llama_3_70b_instruct_ibm_genai_template_mt_bench_single_turn.json @@ -3,7 +3,8 @@ "inference_model": { "__type__": "ibm_gen_ai_inference_engine", "model_name": "meta-llama/llama-3-70b-instruct", - "max_new_tokens": 252 + "max_new_tokens": 252, + "random_seed": 42 }, "template": "templates.response_assessment.rating.mt_bench_single_turn", "task": "rating.single_turn", diff --git a/src/unitxt/catalog/metrics/llm_as_judge/rating/llama_3_8b_instruct_ibm_genai_template_mt_bench_single_turn.json b/src/unitxt/catalog/metrics/llm_as_judge/rating/llama_3_8b_instruct_ibm_genai_template_mt_bench_single_turn.json index 86ad3258f..7e6d7a5ea 100644 --- a/src/unitxt/catalog/metrics/llm_as_judge/rating/llama_3_8b_instruct_ibm_genai_template_mt_bench_single_turn.json +++ b/src/unitxt/catalog/metrics/llm_as_judge/rating/llama_3_8b_instruct_ibm_genai_template_mt_bench_single_turn.json @@ -3,7 +3,8 @@ "inference_model": { "__type__": "ibm_gen_ai_inference_engine", "model_name": "meta-llama/llama-3-8b-instruct", - "max_new_tokens": 252 + "max_new_tokens": 252, + "random_seed": 42 }, "template": "templates.response_assessment.rating.mt_bench_single_turn", "task": "rating.single_turn", diff --git a/src/unitxt/catalog/metrics/llm_as_judge/safety/llama_3_70b_instruct_ibm_genai_template_unsafe_content.json b/src/unitxt/catalog/metrics/llm_as_judge/safety/llama_3_70b_instruct_ibm_genai_template_unsafe_content.json index bf0e0c4cd..ba087faf1 100644 --- 
a/src/unitxt/catalog/metrics/llm_as_judge/safety/llama_3_70b_instruct_ibm_genai_template_unsafe_content.json +++ b/src/unitxt/catalog/metrics/llm_as_judge/safety/llama_3_70b_instruct_ibm_genai_template_unsafe_content.json @@ -3,7 +3,8 @@ "inference_model": { "__type__": "ibm_gen_ai_inference_engine", "model_name": "meta-llama/llama-3-70b-instruct", - "max_new_tokens": 252 + "max_new_tokens": 252, + "random_seed": 42 }, "template": "templates.safety.unsafe_content", "task": "rating.single_turn", diff --git a/src/unitxt/catalog/metrics/llm_as_judge/safety/llama_3_8b_instruct_ibm_genai_template_unsafe_content.json b/src/unitxt/catalog/metrics/llm_as_judge/safety/llama_3_8b_instruct_ibm_genai_template_unsafe_content.json index 33231da97..a40caf7b8 100644 --- a/src/unitxt/catalog/metrics/llm_as_judge/safety/llama_3_8b_instruct_ibm_genai_template_unsafe_content.json +++ b/src/unitxt/catalog/metrics/llm_as_judge/safety/llama_3_8b_instruct_ibm_genai_template_unsafe_content.json @@ -3,7 +3,8 @@ "inference_model": { "__type__": "ibm_gen_ai_inference_engine", "model_name": "meta-llama/llama-3-8b-instruct", - "max_new_tokens": 252 + "max_new_tokens": 252, + "random_seed": 42 }, "template": "templates.safety.unsafe_content", "task": "rating.single_turn", diff --git a/src/unitxt/catalog/metrics/meteor.json b/src/unitxt/catalog/metrics/meteor.json index 293c6eae8..1b36f4d7f 100644 --- a/src/unitxt/catalog/metrics/meteor.json +++ b/src/unitxt/catalog/metrics/meteor.json @@ -1,6 +1,3 @@ { - "__type__": "huggingface_metric", - "hf_metric_name": "meteor", - "main_score": "meteor", - "prediction_type": "str" + "__type__": "meteor" } diff --git a/src/unitxt/catalog/metrics/normalized_sacrebleu.json b/src/unitxt/catalog/metrics/normalized_sacrebleu.json index 7b90e60db..7eb23faff 100644 --- a/src/unitxt/catalog/metrics/normalized_sacrebleu.json +++ b/src/unitxt/catalog/metrics/normalized_sacrebleu.json @@ -1,6 +1,7 @@ { "__type__": "metric_pipeline", "main_score": "sacrebleu", + "prediction_type": "str", "preprocess_steps": [ { "__type__": "copy", diff --git a/src/unitxt/catalog/metrics/rag/end_to_end/answer_correctness.json b/src/unitxt/catalog/metrics/rag/end_to_end/answer_correctness.json new file mode 100644 index 000000000..6b55c10e3 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/end_to_end/answer_correctness.json @@ -0,0 +1,22 @@ +{ + "__type__": "metric_pipeline", + "main_score": "recall", + "preprocess_steps": [ + { + "__type__": "copy", + "field_to_field": [ + [ + "prediction/answer", + "prediction" + ] + ] + }, + { + "__type__": "copy", + "field_to_field": { + "task_data/reference_answers": "references" + } + } + ], + "metric": "metrics.token_overlap[score_prefix=answer_correctness_]" +} diff --git a/src/unitxt/catalog/metrics/rag/end_to_end/answer_faithfulness.json b/src/unitxt/catalog/metrics/rag/end_to_end/answer_faithfulness.json new file mode 100644 index 000000000..76d396388 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/end_to_end/answer_faithfulness.json @@ -0,0 +1,25 @@ +{ + "__type__": "metric_pipeline", + "main_score": "precision", + "preprocess_steps": [ + { + "__type__": "copy", + "field_to_field": [ + [ + "prediction/contexts", + "references" + ] + ] + }, + { + "__type__": "copy", + "field_to_field": [ + [ + "prediction/answer", + "prediction" + ] + ] + } + ], + "metric": "metrics.token_overlap[score_prefix=answer_faithfulness_]" +} diff --git a/src/unitxt/catalog/metrics/rag/end_to_end/answer_reward.json b/src/unitxt/catalog/metrics/rag/end_to_end/answer_reward.json new file 
mode 100644 index 000000000..52a336c81 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/end_to_end/answer_reward.json @@ -0,0 +1,23 @@ +{ + "__type__": "metric_pipeline", + "main_score": "score", + "preprocess_steps": [ + { + "__type__": "copy", + "field_to_field": [ + [ + "prediction/answer", + "prediction" + ] + ] + }, + { + "__type__": "list_field_values", + "fields": [ + "task_data/question" + ], + "to_field": "references" + } + ], + "metric": "metrics.reward.deberta_v3_large_v2[score_prefix=answer_reward_]" +} diff --git a/src/unitxt/catalog/metrics/rag/end_to_end/context_correctness.json b/src/unitxt/catalog/metrics/rag/end_to_end/context_correctness.json new file mode 100644 index 000000000..13e5bdae9 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/end_to_end/context_correctness.json @@ -0,0 +1,23 @@ +{ + "__type__": "metric_pipeline", + "main_score": "score", + "preprocess_steps": [ + { + "__type__": "copy", + "field_to_field": [ + [ + "prediction/context_ids", + "prediction" + ] + ] + }, + { + "__type__": "list_field_values", + "fields": [ + "task_data/reference_context_ids" + ], + "to_field": "references" + } + ], + "metric": "metrics.mrr[score_prefix=context_correctness_]" +} diff --git a/src/unitxt/catalog/metrics/rag/end_to_end/context_relevance.json b/src/unitxt/catalog/metrics/rag/end_to_end/context_relevance.json new file mode 100644 index 000000000..cb281b2f7 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/end_to_end/context_relevance.json @@ -0,0 +1,25 @@ +{ + "__type__": "metric_pipeline", + "main_score": "score", + "preprocess_steps": [ + { + "__type__": "copy", + "field_to_field": [ + [ + "prediction/contexts", + "references" + ] + ] + }, + { + "__type__": "copy", + "field_to_field": [ + [ + "task_data/question", + "prediction" + ] + ] + } + ], + "metric": "metrics.perplexity_q.flan_t5_small[score_prefix=context_relevance_]" +} diff --git a/src/unitxt/catalog/metrics/rag/retrieval_at_k.json b/src/unitxt/catalog/metrics/rag/retrieval_at_k.json new file mode 100644 index 000000000..2dc1a82a5 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/retrieval_at_k.json @@ -0,0 +1,18 @@ +{ + "__type__": "metric_pipeline", + "main_score": "score", + "preprocess_steps": [ + { + "__type__": "copy", + "field": "context_ids", + "to_field": "prediction" + }, + { + "__type__": "wrap", + "field": "ground_truths_context_ids", + "inside": "list", + "to_field": "references" + } + ], + "metric": "metrics.retrieval_at_k" +} diff --git a/src/unitxt/catalog/metrics/rouge.json b/src/unitxt/catalog/metrics/rouge.json index 448f21f09..82844033a 100644 --- a/src/unitxt/catalog/metrics/rouge.json +++ b/src/unitxt/catalog/metrics/rouge.json @@ -1,4 +1,3 @@ { - "__type__": "rouge", - "n_resamples": null + "__type__": "rouge" } diff --git a/src/unitxt/catalog/metrics/rouge_with_confidence_intervals.json b/src/unitxt/catalog/metrics/rouge_with_confidence_intervals.json index 82844033a..85da472ec 100644 --- a/src/unitxt/catalog/metrics/rouge_with_confidence_intervals.json +++ b/src/unitxt/catalog/metrics/rouge_with_confidence_intervals.json @@ -1,3 +1,4 @@ { - "__type__": "rouge" + "__type__": "rouge", + "__description__": "This is deprecated. 
Use 'metrics.rouge', which also generates confidence intervals." } diff --git a/src/unitxt/catalog/metrics/weighted_win_rate_correlation.json b/src/unitxt/catalog/metrics/weighted_win_rate_correlation.json new file mode 100644 index 000000000..d684b395e --- /dev/null +++ b/src/unitxt/catalog/metrics/weighted_win_rate_correlation.json @@ -0,0 +1,3 @@ +{ + "__type__": "weighted_win_rate_correlation" +} diff --git a/src/unitxt/catalog/operators/arena_hard_hf_space_processing_steps.json b/src/unitxt/catalog/operators/arena_hard_hf_space_processing_steps.json new file mode 100644 index 000000000..2ae8625e3 --- /dev/null +++ b/src/unitxt/catalog/operators/arena_hard_hf_space_processing_steps.json @@ -0,0 +1,229 @@ +{ + "__type__": "sequential_operator", + "steps": [ + { + "__type__": "rename_fields", + "field_to_field": { + "cluster": "group" + }, + "apply_to_streams": [ + "questions" + ] + }, + { + "__type__": "copy", + "field_to_field": { + "turns/0/content": "model_input" + }, + "apply_to_streams": [ + "questions" + ] + }, + { + "__type__": "copy", + "field_to_field": { + "choices/0/turns/0/content": "model_output", + "choices/0/turns/0/token_len": "model_output_token_len" + }, + "apply_to_streams": [ + "model_answer" + ] + }, + { + "__type__": "apply", + "function": "str.lower", + "to_field": "model_id", + "apply_to_streams": [ + "model_answer" + ], + "_argv": [ + "model_id" + ] + }, + { + "__type__": "copy", + "field_to_field": { + "games/0/user_prompt": "judge_input_model_1_ordered_first", + "games/1/user_prompt": "judge_input_model_2_ordered_first", + "games/0/judgment": "judge_output_model_1_ordered_first", + "games/1/judgment": "judge_output_model_2_ordered_first", + "games/0/score": "score_model_1_ordered_first", + "games/1/score": "score_model_2_ordered_first" + }, + "apply_to_streams": [ + "judgment" + ] + }, + { + "__type__": "rename_fields", + "field_to_field": { + "model": "model_2", + "judge": "judge_model_id" + }, + "apply_to_streams": [ + "judgment" + ] + }, + { + "__type__": "set", + "fields": { + "model_1": "gpt-4-0314" + }, + "apply_to_streams": [ + "judgment" + ] + }, + { + "__type__": "apply", + "function": "str", + "to_field": "judge_input_model_1_ordered_first", + "apply_to_streams": [ + "judgment" + ], + "_argv": [ + "judge_input_model_1_ordered_first" + ] + }, + { + "__type__": "apply", + "function": "str", + "to_field": "judge_input_model_2_ordered_first", + "apply_to_streams": [ + "judgment" + ], + "_argv": [ + "judge_input_model_2_ordered_first" + ] + }, + { + "__type__": "apply", + "function": "str.lower", + "to_field": "model_1", + "apply_to_streams": [ + "judgment" + ], + "_argv": [ + "model_1" + ] + }, + { + "__type__": "apply", + "function": "str.lower", + "to_field": "model_2", + "apply_to_streams": [ + "judgment" + ], + "_argv": [ + "model_2" + ] + }, + { + "__type__": "filter_by_condition", + "values": { + "score_model_1_ordered_first": [ + "A=B", + "A>B", + "A>>B", + "B>A", + "B>>A" + ], + "score_model_2_ordered_first": [ + "A=B", + "A>B", + "A>>B", + "B>A", + "B>>A" + ] + }, + "condition": "in", + "apply_to_streams": [ + "judgment" + ] + }, + { + "__type__": "join_streams", + "left_stream": "questions", + "right_stream": "judgment", + "how": "inner", + "on": [ + "question_id" + ], + "new_stream_name": "merged_stream" + }, + { + "__type__": "rename_fields", + "field_to_field": { + "model_id": "model_1", + "model_output": "model_1_output" + }, + "apply_to_streams": [ + "model_answer" + ] + }, + { + "__type__": "join_streams", + "left_stream": "merged_stream", + 
"right_stream": "model_answer", + "how": "inner", + "on": [ + "question_id", + "model_1" + ], + "new_stream_name": "merged_stream" + }, + { + "__type__": "rename_fields", + "field_to_field": { + "model_1": "model_2", + "model_1_output": "model_2_output" + }, + "apply_to_streams": [ + "model_answer" + ] + }, + { + "__type__": "join_streams", + "left_stream": "merged_stream", + "right_stream": "model_answer", + "how": "inner", + "on": [ + "question_id", + "model_2" + ], + "new_stream_name": "merged_stream" + }, + { + "__type__": "delete_splits", + "splits": [ + "questions", + "model_answer", + "judgment" + ] + }, + { + "__type__": "rename_splits", + "mapper": { + "merged_stream": "test" + } + }, + { + "__type__": "select_fields", + "fields": [ + "question_id", + "category", + "model_input", + "model_1", + "model_2", + "judge_model_id", + "model_1_output", + "model_2_output", + "score_model_1_ordered_first", + "score_model_2_ordered_first", + "judge_input_model_1_ordered_first", + "judge_input_model_2_ordered_first", + "judge_output_model_1_ordered_first", + "judge_output_model_2_ordered_first" + ] + } + ] +} diff --git a/src/unitxt/catalog/operators/mt_bench/pairwise_hf_space_processing_steps.json b/src/unitxt/catalog/operators/mt_bench/pairwise_hf_space_processing_steps.json index 6702b0f97..beb71a474 100644 --- a/src/unitxt/catalog/operators/mt_bench/pairwise_hf_space_processing_steps.json +++ b/src/unitxt/catalog/operators/mt_bench/pairwise_hf_space_processing_steps.json @@ -13,7 +13,6 @@ { "__type__": "rename_fields", "field_to_field": { - "judge": "judge_model_id", "g1_user_prompt": "judge_input_model_1_ordered_first", "g2_user_prompt": "judge_input_model_2_ordered_first", "g1_judgment": "judge_output_model_1_ordered_first", @@ -25,6 +24,14 @@ "judgment" ] }, + { + "__type__": "copy", + "field": "judge/0", + "to_field": "judge_model_id", + "apply_to_streams": [ + "judgment" + ] + }, { "__type__": "apply", "function": "str.lower", diff --git a/src/unitxt/catalog/operators/mt_bench/rating_hf_space_processing_steps.json b/src/unitxt/catalog/operators/mt_bench/rating_hf_space_processing_steps.json index 1e10589a8..05b4e4c4f 100644 --- a/src/unitxt/catalog/operators/mt_bench/rating_hf_space_processing_steps.json +++ b/src/unitxt/catalog/operators/mt_bench/rating_hf_space_processing_steps.json @@ -14,7 +14,6 @@ "__type__": "rename_fields", "field_to_field": { "model": "model_id", - "judge": "judge_model_id", "user_prompt": "judge_input", "judgment": "judge_output" }, @@ -22,6 +21,14 @@ "judgment" ] }, + { + "__type__": "copy", + "field": "judge/0", + "to_field": "judge_model_id", + "apply_to_streams": [ + "judgment" + ] + }, { "__type__": "rename_fields", "field_to_field": { diff --git a/src/unitxt/catalog/processors/extract_arena_hard_numerical_judgment.json b/src/unitxt/catalog/processors/extract_arena_hard_numerical_judgment.json new file mode 100644 index 000000000..b424d38d3 --- /dev/null +++ b/src/unitxt/catalog/processors/extract_arena_hard_numerical_judgment.json @@ -0,0 +1,9 @@ +{ + "__type__": "sequential_operator", + "steps": [ + { + "__type__": "extract_arena_hard_numerical_judgment", + "field": "prediction" + } + ] +} diff --git a/src/unitxt/catalog/processors/load_json_predictions.json b/src/unitxt/catalog/processors/load_json_predictions.json new file mode 100644 index 000000000..90a2257f1 --- /dev/null +++ b/src/unitxt/catalog/processors/load_json_predictions.json @@ -0,0 +1,10 @@ +{ + "__type__": "sequential_operator", + "steps": [ + { + "__type__": "load_json", + "field": 
"prediction", + "process_every_value": false + } + ] +} diff --git a/src/unitxt/catalog/system_prompt/models/japanese_llama.json b/src/unitxt/catalog/system_prompts/models/japanese_llama.json similarity index 100% rename from src/unitxt/catalog/system_prompt/models/japanese_llama.json rename to src/unitxt/catalog/system_prompts/models/japanese_llama.json diff --git a/src/unitxt/catalog/tasks/completion/extractive.json b/src/unitxt/catalog/tasks/completion/extractive.json index 69ba70e17..7920c6204 100644 --- a/src/unitxt/catalog/tasks/completion/extractive.json +++ b/src/unitxt/catalog/tasks/completion/extractive.json @@ -8,7 +8,7 @@ "reference_fields": { "completion": "str" }, - "prediction_type": "Dict[str,Any]", + "prediction_type": "Dict[str, Any]", "metrics": [ "metrics.squad" ] diff --git a/src/unitxt/catalog/tasks/evaluation/preference.json b/src/unitxt/catalog/tasks/evaluation/preference.json index d6488a2fa..08c1ec216 100644 --- a/src/unitxt/catalog/tasks/evaluation/preference.json +++ b/src/unitxt/catalog/tasks/evaluation/preference.json @@ -1,16 +1,16 @@ { "__type__": "task", - "input_fields": [ - "input", - "input_type", - "output_type", - "choices", - "instruction" - ], - "reference_fields": [ - "choices", - "output_choice" - ], + "input_fields": { + "input": "str", + "input_type": "str", + "output_type": "str", + "choices": "List[str]", + "instruction": "str" + }, + "reference_fields": { + "choices": "List[str]", + "output_choice": "int" + }, "metrics": [ "metrics.accuracy" ], diff --git a/src/unitxt/catalog/tasks/generation/from_pair.json b/src/unitxt/catalog/tasks/generation/from_pair.json new file mode 100644 index 000000000..d3f59b95d --- /dev/null +++ b/src/unitxt/catalog/tasks/generation/from_pair.json @@ -0,0 +1,27 @@ +{ + "__type__": "task", + "input_fields": { + "input_a": "str", + "type_of_input_a": "str", + "input_b": "str", + "type_of_input_b": "str", + "type_of_output": "str" + }, + "reference_fields": { + "output": "str" + }, + "prediction_type": "str", + "metrics": [ + "metrics.bleu", + "metrics.rouge", + "metrics.bert_score.bert_base_uncased", + "metrics.meteor" + ], + "augmentable_inputs": [ + "input_a", + "input_b" + ], + "defaults": { + "type_of_output": "Text" + } +} diff --git a/src/unitxt/catalog/tasks/ner/all_entity_types.json b/src/unitxt/catalog/tasks/ner/all_entity_types.json index 942bbd9ce..ae88b535e 100644 --- a/src/unitxt/catalog/tasks/ner/all_entity_types.json +++ b/src/unitxt/catalog/tasks/ner/all_entity_types.json @@ -10,7 +10,7 @@ "text": "str", "labels": "List[str]" }, - "prediction_type": "List[Tuple[str,str]]", + "prediction_type": "List[Tuple[str, str]]", "metrics": [ "metrics.ner" ], diff --git a/src/unitxt/catalog/tasks/ner/single_entity_type.json b/src/unitxt/catalog/tasks/ner/single_entity_type.json index 72a509ff6..f5b000075 100644 --- a/src/unitxt/catalog/tasks/ner/single_entity_type.json +++ b/src/unitxt/catalog/tasks/ner/single_entity_type.json @@ -10,7 +10,7 @@ "text": "str", "labels": "List[str]" }, - "prediction_type": "List[Tuple[str,str]]", + "prediction_type": "List[Tuple[str, str]]", "metrics": [ "metrics.ner" ], diff --git a/src/unitxt/catalog/tasks/qa/multiple_choice/open.json b/src/unitxt/catalog/tasks/qa/multiple_choice/open.json index 53c15f40f..1cd21924d 100644 --- a/src/unitxt/catalog/tasks/qa/multiple_choice/open.json +++ b/src/unitxt/catalog/tasks/qa/multiple_choice/open.json @@ -5,7 +5,7 @@ "choices": "List[str]" }, "reference_fields": { - "answer": "Union[int,str]", + "answer": "Union[int, str]", "choices": 
"List[str]" }, "prediction_type": "str", diff --git a/src/unitxt/catalog/tasks/qa/multiple_choice/with_context.json b/src/unitxt/catalog/tasks/qa/multiple_choice/with_context.json index 6bfc2541d..afafb1e25 100644 --- a/src/unitxt/catalog/tasks/qa/multiple_choice/with_context.json +++ b/src/unitxt/catalog/tasks/qa/multiple_choice/with_context.json @@ -7,7 +7,7 @@ "choices": "List[str]" }, "reference_fields": { - "answer": "Union[int,str]", + "answer": "Union[int, str]", "choices": "List[str]" }, "prediction_type": "str", diff --git a/src/unitxt/catalog/tasks/qa/multiple_choice/with_context/with_topic.json b/src/unitxt/catalog/tasks/qa/multiple_choice/with_context/with_topic.json index bba0daef3..ae30b5c3b 100644 --- a/src/unitxt/catalog/tasks/qa/multiple_choice/with_context/with_topic.json +++ b/src/unitxt/catalog/tasks/qa/multiple_choice/with_context/with_topic.json @@ -8,7 +8,7 @@ "choices": "List[str]" }, "reference_fields": { - "answer": "Union[int,str]", + "answer": "Union[int, str]", "choices": "List[str]" }, "prediction_type": "str", diff --git a/src/unitxt/catalog/tasks/qa/multiple_choice/with_topic.json b/src/unitxt/catalog/tasks/qa/multiple_choice/with_topic.json index 6a7d9b104..24e86e13a 100644 --- a/src/unitxt/catalog/tasks/qa/multiple_choice/with_topic.json +++ b/src/unitxt/catalog/tasks/qa/multiple_choice/with_topic.json @@ -6,7 +6,7 @@ "choices": "List[str]" }, "reference_fields": { - "answer": "Union[int,str]", + "answer": "Union[int, str]", "choices": "List[str]" }, "prediction_type": "str", diff --git a/src/unitxt/catalog/tasks/rag/corpora.json b/src/unitxt/catalog/tasks/rag/corpora.json new file mode 100644 index 000000000..c0d246473 --- /dev/null +++ b/src/unitxt/catalog/tasks/rag/corpora.json @@ -0,0 +1,14 @@ +{ + "__type__": "task", + "input_fields": { + "document_id": "str", + "title": "str", + "passages": "List[str]", + "metadata_field": "str" + }, + "reference_fields": {}, + "prediction_type": "Any", + "metrics": [ + "metrics.rouge" + ] +} diff --git a/src/unitxt/catalog/tasks/rag/end_to_end.json b/src/unitxt/catalog/tasks/rag/end_to_end.json new file mode 100644 index 000000000..8fe833d9a --- /dev/null +++ b/src/unitxt/catalog/tasks/rag/end_to_end.json @@ -0,0 +1,25 @@ +{ + "__type__": "task", + "input_fields": { + "question": "str", + "question_id": "Any", + "metadata_field": "str" + }, + "reference_fields": { + "reference_answers": "List[str]", + "reference_contexts": "List[str]", + "reference_context_ids": "List[str]", + "is_answerable_label": "bool" + }, + "metrics": [ + "metrics.rag.end_to_end.answer_correctness", + "metrics.rag.end_to_end.answer_faithfulness", + "metrics.rag.end_to_end.answer_reward", + "metrics.rag.end_to_end.context_correctness", + "metrics.rag.end_to_end.context_relevance" + ], + "prediction_type": "Dict[str, Any]", + "augmentable_inputs": [ + "question" + ] +} diff --git a/src/unitxt/catalog/tasks/rag/response_generation.json b/src/unitxt/catalog/tasks/rag/response_generation.json index 2a2fefee4..0d22c93f2 100644 --- a/src/unitxt/catalog/tasks/rag/response_generation.json +++ b/src/unitxt/catalog/tasks/rag/response_generation.json @@ -2,7 +2,7 @@ "__type__": "task", "input_fields": { "contexts": "List[str]", - "contexts_ids": "List[int]", + "contexts_ids": "Union[List[int], List[str]]", "question": "str" }, "reference_fields": { diff --git a/src/unitxt/catalog/tasks/response_assessment/pairwise_comparative_rating/single_turn.json b/src/unitxt/catalog/tasks/response_assessment/pairwise_comparative_rating/single_turn.json new file mode 
100644 index 000000000..3e1790d4a --- /dev/null +++ b/src/unitxt/catalog/tasks/response_assessment/pairwise_comparative_rating/single_turn.json @@ -0,0 +1,18 @@ +{ + "__type__": "task", + "input_fields": { + "question": "str", + "answer_a": "str", + "answer_b": "str", + "model_a": "str", + "model_b": "str" + }, + "reference_fields": { + "answer_a_preference": "int" + }, + "prediction_type": "int", + "metrics": [ + "metrics.weighted_win_rate_correlation", + "metrics.accuracy" + ] +} diff --git a/src/unitxt/catalog/tasks/response_assessment/pairwise_comparison/multi_turn.json b/src/unitxt/catalog/tasks/response_assessment/pairwise_comparison/multi_turn.json index a5d20dc10..6dfe76b7d 100644 --- a/src/unitxt/catalog/tasks/response_assessment/pairwise_comparison/multi_turn.json +++ b/src/unitxt/catalog/tasks/response_assessment/pairwise_comparison/multi_turn.json @@ -8,6 +8,8 @@ "winner": "str" }, "metrics": [ - "metrics.accuracy" + "metrics.accuracy", + "metrics.f1_micro", + "metrics.f1_macro" ] } diff --git a/src/unitxt/catalog/tasks/response_assessment/pairwise_comparison/multi_turn_with_reference.json b/src/unitxt/catalog/tasks/response_assessment/pairwise_comparison/multi_turn_with_reference.json index 6f59bdeea..c7c3f03c3 100644 --- a/src/unitxt/catalog/tasks/response_assessment/pairwise_comparison/multi_turn_with_reference.json +++ b/src/unitxt/catalog/tasks/response_assessment/pairwise_comparison/multi_turn_with_reference.json @@ -9,6 +9,8 @@ "winner": "str" }, "metrics": [ - "metrics.accuracy" + "metrics.accuracy", + "metrics.f1_micro", + "metrics.f1_macro" ] } diff --git a/src/unitxt/catalog/tasks/response_assessment/pairwise_comparison/single_turn.json b/src/unitxt/catalog/tasks/response_assessment/pairwise_comparison/single_turn.json index ea2573d16..c72f0e0f2 100644 --- a/src/unitxt/catalog/tasks/response_assessment/pairwise_comparison/single_turn.json +++ b/src/unitxt/catalog/tasks/response_assessment/pairwise_comparison/single_turn.json @@ -9,6 +9,8 @@ "winner": "str" }, "metrics": [ - "metrics.accuracy" + "metrics.accuracy", + "metrics.f1_micro", + "metrics.f1_macro" ] } diff --git a/src/unitxt/catalog/tasks/response_assessment/pairwise_comparison/single_turn_with_reference.json b/src/unitxt/catalog/tasks/response_assessment/pairwise_comparison/single_turn_with_reference.json index ca8f04df9..b851badd8 100644 --- a/src/unitxt/catalog/tasks/response_assessment/pairwise_comparison/single_turn_with_reference.json +++ b/src/unitxt/catalog/tasks/response_assessment/pairwise_comparison/single_turn_with_reference.json @@ -10,6 +10,8 @@ "winner": "str" }, "metrics": [ - "metrics.accuracy" + "metrics.accuracy", + "metrics.f1_micro", + "metrics.f1_macro" ] } diff --git a/src/unitxt/catalog/tasks/response_assessment/rating/multi_turn.json b/src/unitxt/catalog/tasks/response_assessment/rating/multi_turn.json index 4da763cb2..3bce31b0d 100644 --- a/src/unitxt/catalog/tasks/response_assessment/rating/multi_turn.json +++ b/src/unitxt/catalog/tasks/response_assessment/rating/multi_turn.json @@ -8,5 +8,6 @@ }, "metrics": [ "metrics.spearman" - ] + ], + "prediction_type": "float" } diff --git a/src/unitxt/catalog/tasks/response_assessment/rating/multi_turn_with_reference.json b/src/unitxt/catalog/tasks/response_assessment/rating/multi_turn_with_reference.json index 082cb4414..1b34ef838 100644 --- a/src/unitxt/catalog/tasks/response_assessment/rating/multi_turn_with_reference.json +++ b/src/unitxt/catalog/tasks/response_assessment/rating/multi_turn_with_reference.json @@ -9,5 +9,6 @@ }, 
"metrics": [ "metrics.spearman" - ] + ], + "prediction_type": "float" } diff --git a/src/unitxt/catalog/tasks/response_assessment/rating/single_turn.json b/src/unitxt/catalog/tasks/response_assessment/rating/single_turn.json index 4c496eeb5..6ef778e91 100644 --- a/src/unitxt/catalog/tasks/response_assessment/rating/single_turn.json +++ b/src/unitxt/catalog/tasks/response_assessment/rating/single_turn.json @@ -9,5 +9,6 @@ }, "metrics": [ "metrics.spearman" - ] + ], + "prediction_type": "float" } diff --git a/src/unitxt/catalog/tasks/response_assessment/rating/single_turn_with_reference.json b/src/unitxt/catalog/tasks/response_assessment/rating/single_turn_with_reference.json index 85d12c4be..9a690cbef 100644 --- a/src/unitxt/catalog/tasks/response_assessment/rating/single_turn_with_reference.json +++ b/src/unitxt/catalog/tasks/response_assessment/rating/single_turn_with_reference.json @@ -10,5 +10,6 @@ }, "metrics": [ "metrics.spearman" - ] + ], + "prediction_type": "float" } diff --git a/src/unitxt/catalog/tasks/span_labeling/extraction.json b/src/unitxt/catalog/tasks/span_labeling/extraction.json index e98cfc5ee..345e99c47 100644 --- a/src/unitxt/catalog/tasks/span_labeling/extraction.json +++ b/src/unitxt/catalog/tasks/span_labeling/extraction.json @@ -12,7 +12,7 @@ "spans_ends": "List[int]", "labels": "List[str]" }, - "prediction_type": "List[Tuple[str,str]]", + "prediction_type": "List[Tuple[str, str]]", "metrics": [ "metrics.ner" ], diff --git a/src/unitxt/catalog/tasks/targeted_sentiment_extraction/all_sentiment_classes.json b/src/unitxt/catalog/tasks/targeted_sentiment_extraction/all_sentiment_classes.json index 49556d6c5..c3adb4b97 100644 --- a/src/unitxt/catalog/tasks/targeted_sentiment_extraction/all_sentiment_classes.json +++ b/src/unitxt/catalog/tasks/targeted_sentiment_extraction/all_sentiment_classes.json @@ -10,7 +10,7 @@ "text": "List[str]", "labels": "List[str]" }, - "prediction_type": "List[Tuple[str,str]]", + "prediction_type": "List[Tuple[str, str]]", "metrics": [ "metrics.ner" ], diff --git a/src/unitxt/catalog/tasks/targeted_sentiment_extraction/single_sentiment_class.json b/src/unitxt/catalog/tasks/targeted_sentiment_extraction/single_sentiment_class.json index 58af81082..6e71af8c7 100644 --- a/src/unitxt/catalog/tasks/targeted_sentiment_extraction/single_sentiment_class.json +++ b/src/unitxt/catalog/tasks/targeted_sentiment_extraction/single_sentiment_class.json @@ -11,7 +11,7 @@ "text": "List[str]", "labels": "List[str]" }, - "prediction_type": "List[Tuple[str,str]]", + "prediction_type": "List[Tuple[str, str]]", "metrics": [ "metrics.ner" ], diff --git a/src/unitxt/catalog/templates/generation/from_pair/all.json b/src/unitxt/catalog/templates/generation/from_pair/all.json new file mode 100644 index 000000000..92be50cf8 --- /dev/null +++ b/src/unitxt/catalog/templates/generation/from_pair/all.json @@ -0,0 +1,6 @@ +{ + "__type__": "templates_list", + "items": [ + "templates.generation.from_pair.default" + ] +} diff --git a/src/unitxt/catalog/templates/generation/from_pair/default.json b/src/unitxt/catalog/templates/generation/from_pair/default.json new file mode 100644 index 000000000..2950c2082 --- /dev/null +++ b/src/unitxt/catalog/templates/generation/from_pair/default.json @@ -0,0 +1,9 @@ +{ + "__type__": "input_output_template", + "input_format": "Given the following {type_of_input_a} and {type_of_input_b}, generate the corresponding {type_of_output}.\n{type_of_input_a}: \n{input_a} \n{type_of_input_b}: \n{input_b} \n{type_of_output}:", + "output_format": 
"{output}", + "postprocessors": [ + "processors.take_first_non_empty_line", + "processors.lower_case_till_punc" + ] +} diff --git a/src/unitxt/catalog/templates/rag/end_to_end/json_predictions.json b/src/unitxt/catalog/templates/rag/end_to_end/json_predictions.json new file mode 100644 index 000000000..29c61217f --- /dev/null +++ b/src/unitxt/catalog/templates/rag/end_to_end/json_predictions.json @@ -0,0 +1,8 @@ +{ + "__type__": "input_output_template", + "input_format": "", + "output_format": "{{\"answer\": \"{reference_answers}\", \"contexts\" : [\"{reference_contexts}\"], \"context_ids\" : [\"{reference_context_ids}\"]}}", + "postprocessors": [ + "processors.load_json_predictions" + ] +} diff --git a/src/unitxt/catalog/templates/response_assessment/pairwise_comparative_rating/arena_hard.json b/src/unitxt/catalog/templates/response_assessment/pairwise_comparative_rating/arena_hard.json new file mode 100644 index 000000000..db557a6e2 --- /dev/null +++ b/src/unitxt/catalog/templates/response_assessment/pairwise_comparative_rating/arena_hard.json @@ -0,0 +1,15 @@ +{ + "__type__": "pairwise_comparative_rating_template", + "choice_a_field": "answer_a", + "choice_b_field": "answer_b", + "choice_a_id_field": "model_a", + "choice_b_id_field": "model_b", + "answer_field": "answer_a_preference", + "shuffle": false, + "instruction": "Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user prompt displayed below. You will be given assistant A's answer and assistant B's answer. Your job is to evaluate which assistant's answer is better.\n\nBegin your evaluation by generating your own answer to the prompt. You must provide your answers before judging any answers.\n\nWhen evaluating the assistants' answers, compare both assistants' answers with your answer. You must identify and correct any mistakes or inaccurate information.\n\nThen consider if the assistant's answers are helpful, relevant, and concise. Helpful means the answer correctly responds to the prompt or follows the instructions. Note when user prompt has any ambiguity or more than one interpretation, it is more helpful and appropriate to ask for clarifications or more information from the user than providing an answer based on assumptions. Relevant means all parts of the response closely connect or are appropriate to what is being asked. Concise means the response is clear and not verbose or excessive.\n\nThen consider the creativity and novelty of the assistant's answers when needed. Finally, identify any missing important information in the assistants' answers that would be beneficial to include when responding to the user prompt.\n\nAfter providing your explanation, you must output only one of the following choices as your final verdict with a label:\n\n1. Assistant A is significantly better: [[A>>B]]\n2. Assistant A is slightly better: [[A>B]]\n3. Tie, relatively the same: [[A=B]]\n4. Assistant B is slightly better: [[B>A]]\n5. 
Assistant B is significantly better: [[B>>A]]\n\nExample output: \"My final verdict is tie: [[A=B]]\".", + "input_format": "<|User Prompt|>\n{question}\n\n<|The Start of Assistant A's Answer|>\n{answer_a}\n<|The End of Assistant A's Answer|>\n\n<|The Start of Assistant B's Answer|>\n{answer_b}\n<|The End of Assistant B's Answer|>", + "postprocessors": [ + "processors.extract_arena_hard_numerical_judgment" + ], + "output_format": "{answer_a_preference}" +} diff --git a/src/unitxt/catalog/templates/response_assessment/pairwise_comparative_rating/arena_hard_with_shuffling.json b/src/unitxt/catalog/templates/response_assessment/pairwise_comparative_rating/arena_hard_with_shuffling.json new file mode 100644 index 000000000..26864d96e --- /dev/null +++ b/src/unitxt/catalog/templates/response_assessment/pairwise_comparative_rating/arena_hard_with_shuffling.json @@ -0,0 +1,15 @@ +{ + "__type__": "pairwise_comparative_rating_template", + "choice_a_field": "answer_a", + "choice_b_field": "answer_b", + "choice_a_id_field": "model_a", + "choice_b_id_field": "model_b", + "answer_field": "answer_a_preference", + "shuffle": true, + "instruction": "Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user prompt displayed below. You will be given assistant A's answer and assistant B's answer. Your job is to evaluate which assistant's answer is better.\n\nBegin your evaluation by generating your own answer to the prompt. You must provide your answers before judging any answers.\n\nWhen evaluating the assistants' answers, compare both assistants' answers with your answer. You must identify and correct any mistakes or inaccurate information.\n\nThen consider if the assistant's answers are helpful, relevant, and concise. Helpful means the answer correctly responds to the prompt or follows the instructions. Note when user prompt has any ambiguity or more than one interpretation, it is more helpful and appropriate to ask for clarifications or more information from the user than providing an answer based on assumptions. Relevant means all parts of the response closely connect or are appropriate to what is being asked. Concise means the response is clear and not verbose or excessive.\n\nThen consider the creativity and novelty of the assistant's answers when needed. Finally, identify any missing important information in the assistants' answers that would be beneficial to include when responding to the user prompt.\n\nAfter providing your explanation, you must output only one of the following choices as your final verdict with a label:\n\n1. Assistant A is significantly better: [[A>>B]]\n2. Assistant A is slightly better: [[A>B]]\n3. Tie, relatively the same: [[A=B]]\n4. Assistant B is slightly better: [[B>A]]\n5. 
Assistant B is significantly better: [[B>>A]]\n\nExample output: \"My final verdict is tie: [[A=B]]\".", + "input_format": "<|User Prompt|>\n{question}\n\n<|The Start of Assistant A's Answer|>\n{answer_a}\n<|The End of Assistant A's Answer|>\n\n<|The Start of Assistant B's Answer|>\n{answer_b}\n<|The End of Assistant B's Answer|>", + "postprocessors": [ + "processors.extract_arena_hard_numerical_judgment" + ], + "output_format": "{answer_a_preference}" +} diff --git a/src/unitxt/catalog/templates/response_assessment/pairwise_comparative_rating/prometheus_arena_hard.json b/src/unitxt/catalog/templates/response_assessment/pairwise_comparative_rating/prometheus_arena_hard.json new file mode 100644 index 000000000..28d284931 --- /dev/null +++ b/src/unitxt/catalog/templates/response_assessment/pairwise_comparative_rating/prometheus_arena_hard.json @@ -0,0 +1,15 @@ +{ + "__type__": "pairwise_comparative_rating_template", + "choice_a_field": "answer_a", + "choice_b_field": "answer_b", + "choice_a_id_field": "model_a", + "choice_b_id_field": "model_b", + "answer_field": "answer_a_preference", + "shuffle": false, + "instruction": "###Task Description:\n An instruction (might include an Input inside it), a response to evaluate, and a score rubric representing a evaluation criteria are given.\n1. Write a detailed feedback that assess the quality of two responses strictly based on the given score rubric, not evaluating in general.\n2. After writing a feedback, choose a better response between Response A and Response B. You should refer to the score rubric.\n3. The output format should look as follows: \"Feedback: (write a feedback for criteria) [RESULT] (A or B)\"\n4. Please do not generate any other opening, closing, and explanations.\n\n###Instruction:\nPlease act as an impartial judge and evaluate the quality of the responses provided by two AI\n assistants to the user prompt displayed below. You will be given assistant A's answer and assistant B's answer. Your job is to evaluate which assistant's answer is better.\n\nBegin your evaluation by generating your own answer to the prompt. You must provide your answers before judging any answers.\n\nWhen evaluating the assistants' answers, compare both assistants' answers with your answer. You must identify and correct any mistakes or inaccurate information.\n\nThen consider if the assistant's answers are helpful, relevant, and concise. Helpful means the answer correctly responds to the prompt or follows the instructions. Note when user prompt has any ambiguity or more than one interpretation, it is more helpful and appropriate to ask for clarifications or more information from the user than providing an answer based on assumptions. Relevant means all parts of the response closely connect or are appropriate to what is being asked. Concise means the response is clear and not verbose or excessive.\n\nThen consider the creativity and novelty of the assistant's answers when needed. Finally, identify any missing important information in the assistants' answers that would be beneficial to include when responding to the user prompt.", + "input_format": "\n{question}\n\n###Response A:\n{answer_a}\n\n###Response B:\n{answer_b}\n\n###Score Rubric:\n\nYou must output only one of the following choices as your final verdict with a label:\n\n1. Assistant A is significantly better: [[A>>B]]\n2. Assistant A is slightly better: [[A>B]]\n3. Tie, relatively the same: [[A=B]]\n4. Assistant B is slightly better: [[B>A]]\n5. 
Assistant B is significantly better: [[B>>A]]\n\nExample output: \"My final verdict is tie: [[A=B]]\".", + "postprocessors": [ + "processors.extract_arena_hard_numerical_judgment" + ], + "output_format": "{answer_a_preference}" +} diff --git a/src/unitxt/catalog/templates/response_assessment/pairwise_comparative_rating/prometheus_arena_hard_with_shuffling.json b/src/unitxt/catalog/templates/response_assessment/pairwise_comparative_rating/prometheus_arena_hard_with_shuffling.json new file mode 100644 index 000000000..a96940f11 --- /dev/null +++ b/src/unitxt/catalog/templates/response_assessment/pairwise_comparative_rating/prometheus_arena_hard_with_shuffling.json @@ -0,0 +1,15 @@ +{ + "__type__": "pairwise_comparative_rating_template", + "choice_a_field": "answer_a", + "choice_b_field": "answer_b", + "choice_a_id_field": "model_a", + "choice_b_id_field": "model_b", + "answer_field": "answer_a_preference", + "shuffle": true, + "instruction": "###Task Description:\n An instruction (might include an Input inside it), a response to evaluate, and a score rubric representing a evaluation criteria are given.\n1. Write a detailed feedback that assess the quality of two responses strictly based on the given score rubric, not evaluating in general.\n2. After writing a feedback, choose a better response between Response A and Response B. You should refer to the score rubric.\n3. The output format should look as follows: \"Feedback: (write a feedback for criteria) [RESULT] (A or B)\"\n4. Please do not generate any other opening, closing, and explanations.\n\n###Instruction:\nPlease act as an impartial judge and evaluate the quality of the responses provided by two AI\n assistants to the user prompt displayed below. You will be given assistant A's answer and assistant B's answer. Your job is to evaluate which assistant's answer is better.\n\nBegin your evaluation by generating your own answer to the prompt. You must provide your answers before judging any answers.\n\nWhen evaluating the assistants' answers, compare both assistants' answers with your answer. You must identify and correct any mistakes or inaccurate information.\n\nThen consider if the assistant's answers are helpful, relevant, and concise. Helpful means the answer correctly responds to the prompt or follows the instructions. Note when user prompt has any ambiguity or more than one interpretation, it is more helpful and appropriate to ask for clarifications or more information from the user than providing an answer based on assumptions. Relevant means all parts of the response closely connect or are appropriate to what is being asked. Concise means the response is clear and not verbose or excessive.\n\nThen consider the creativity and novelty of the assistant's answers when needed. Finally, identify any missing important information in the assistants' answers that would be beneficial to include when responding to the user prompt.", + "input_format": "\n{question}\n\n###Response A:\n{answer_a}\n\n###Response B:\n{answer_b}\n\n###Score Rubric:\n\nYou must output only one of the following choices as your final verdict with a label:\n\n1. Assistant A is significantly better: [[A>>B]]\n2. Assistant A is slightly better: [[A>B]]\n3. Tie, relatively the same: [[A=B]]\n4. Assistant B is slightly better: [[B>A]]\n5. 
Assistant B is significantly better: [[B>>A]]\n\nExample output: \"My final verdict is tie: [[A=B]]\".", + "postprocessors": [ + "processors.extract_arena_hard_numerical_judgment" + ], + "output_format": "{answer_a_preference}" +} diff --git a/src/unitxt/catalog/templates/response_assessment/pairwise_comparison/mt_bench_multi_turn.json b/src/unitxt/catalog/templates/response_assessment/pairwise_comparison/mt_bench_multi_turn.json new file mode 100644 index 000000000..9561a16fe --- /dev/null +++ b/src/unitxt/catalog/templates/response_assessment/pairwise_comparison/mt_bench_multi_turn.json @@ -0,0 +1,34 @@ +{ + "__type__": "dialog_pairwise_choice_template", + "dialog_fields": [ + { + "__type__": "dialog_fields_data", + "dialog_field": "dialog_a", + "assistant_role_label": "### Assistant A:", + "user_role_label": "### User:", + "system_role_label": "### System:" + }, + { + "__type__": "dialog_fields_data", + "dialog_field": "dialog_b", + "assistant_role_label": "### Assistant B:", + "user_role_label": "### User:", + "system_role_label": "### System:" + } + ], + "turns_separator": "\n\n", + "label_separator": "\n", + "choice_a_field": "dialog_a", + "choice_b_field": "dialog_b", + "answer_field": "winner", + "choice_a_label": "A", + "choice_b_label": "B", + "choice_tie_label": "C", + "shuffle": false, + "instruction": "Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user questions. You should choose the assistant that follows the user's instructions and answers the user's questions better. Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of their responses. You should focus on who provides a better answer to the second user question. Begin your evaluation by comparing the responses of the two assistants and provide a short explanation. Avoid any position biases and ensure that the order in which the responses were presented does not influence your decision. Do not allow the length of the responses to influence your evaluation. Do not favor certain names of the assistants. Be as objective as possible. 
After providing your explanation, output your final verdict by strictly following this format: \"[[A]]\" if assistant A is better, \"[[B]]\" if assistant B is better, and \"[[C]]\" for a tie.\n\n", + "input_format": "<|The Start of Assistant A's Conversation with User|>\n\n{dialog_a}\n\n<|The End of Assistant A's Conversation with User|>\n\n\n<|The Start of Assistant B's Conversation with User|>\n\n{dialog_b}\n\n<|The End of Assistant B's Conversation with User|>", + "output_format": "[[{winner}]]", + "postprocessors": [ + "processors.extract_mt_bench_label_judgment" + ] +} diff --git a/src/unitxt/catalog/templates/response_assessment/pairwise_comparison/mt_bench_multi_turn_with_reference.json b/src/unitxt/catalog/templates/response_assessment/pairwise_comparison/mt_bench_multi_turn_with_reference.json new file mode 100644 index 000000000..bc724e849 --- /dev/null +++ b/src/unitxt/catalog/templates/response_assessment/pairwise_comparison/mt_bench_multi_turn_with_reference.json @@ -0,0 +1,41 @@ +{ + "__type__": "dialog_pairwise_choice_template", + "dialog_fields": [ + { + "__type__": "dialog_fields_data", + "dialog_field": "reference_dialog", + "assistant_role_label": "### Reference answer:", + "user_role_label": "### User:", + "system_role_label": "### System:" + }, + { + "__type__": "dialog_fields_data", + "dialog_field": "dialog_a", + "assistant_role_label": "### Assistant A:", + "user_role_label": "### User:", + "system_role_label": "### System:" + }, + { + "__type__": "dialog_fields_data", + "dialog_field": "dialog_b", + "assistant_role_label": "### Assistant B:", + "user_role_label": "### User:", + "system_role_label": "### System:" + } + ], + "turns_separator": "\n\n", + "label_separator": "\n", + "choice_a_field": "dialog_a", + "choice_b_field": "dialog_b", + "answer_field": "winner", + "choice_a_label": "A", + "choice_b_label": "B", + "choice_tie_label": "C", + "shuffle": false, + "instruction": "Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user questions. Your evaluation should consider correctness and helpfulness. You will be given reference answers, the assistant A's answers, the assistant B's answers. Your job is to determine which assistant provides correct and helpful answers to the second user question. Begin your evaluation by comparing both assistants' answers with the reference answers. Identify and correct any mistakes. Avoid any position biases and ensure that the order in which the responses were presented does not influence your decision. Do not allow the length of the responses to influence your evaluation. Do not favor certain names of the assistants. Be as objective as possible. 
After providing your explanation, output your final verdict by strictly following this format: \"[[A]]\" if assistant A is better, \"[[B]]\" if assistant B is better, and \"[[C]]\" for a tie.\n\n", + "input_format": "<|The Start of Reference Answer|>\n\n{reference_dialog}\n\n<|The End of Reference Answer|>\n\n\n<|The Start of Assistant A's Conversation with User|>\n\n{dialog_a}\n\n<|The End of Assistant A's Conversation with User|>\n\n\n<|The Start of Assistant B's Conversation with User|>\n\n{dialog_b}\n\n<|The End of Assistant B's Conversation with User|>", + "output_format": "[[{winner}]]", + "postprocessors": [ + "processors.extract_mt_bench_label_judgment" + ] +} diff --git a/src/unitxt/catalog/templates/response_assessment/pairwise_comparison/mt_bench_multi_turn_with_reference_with_shuffle.json b/src/unitxt/catalog/templates/response_assessment/pairwise_comparison/mt_bench_multi_turn_with_reference_with_shuffling.json similarity index 100% rename from src/unitxt/catalog/templates/response_assessment/pairwise_comparison/mt_bench_multi_turn_with_reference_with_shuffle.json rename to src/unitxt/catalog/templates/response_assessment/pairwise_comparison/mt_bench_multi_turn_with_reference_with_shuffling.json diff --git a/src/unitxt/catalog/templates/response_assessment/pairwise_comparison/mt_bench_multi_turn_with_shuffle.json b/src/unitxt/catalog/templates/response_assessment/pairwise_comparison/mt_bench_multi_turn_with_shuffling.json similarity index 100% rename from src/unitxt/catalog/templates/response_assessment/pairwise_comparison/mt_bench_multi_turn_with_shuffle.json rename to src/unitxt/catalog/templates/response_assessment/pairwise_comparison/mt_bench_multi_turn_with_shuffling.json diff --git a/src/unitxt/catalog/templates/response_assessment/pairwise_comparison/mt_bench_single_turn.json b/src/unitxt/catalog/templates/response_assessment/pairwise_comparison/mt_bench_single_turn.json new file mode 100644 index 000000000..96c2aa2a6 --- /dev/null +++ b/src/unitxt/catalog/templates/response_assessment/pairwise_comparison/mt_bench_single_turn.json @@ -0,0 +1,16 @@ +{ + "__type__": "pairwise_choice_template", + "choice_a_field": "answer_a", + "choice_b_field": "answer_b", + "answer_field": "winner", + "choice_a_label": "A", + "choice_b_label": "B", + "choice_tie_label": "C", + "shuffle": false, + "instruction": "Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user question displayed below. You should choose the assistant that follows the user's instructions and answers the user's question better. Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of their responses. Begin your evaluation by comparing the two responses and provide a short explanation. Avoid any position biases and ensure that the order in which the responses were presented does not influence your decision. Do not allow the length of the responses to influence your evaluation. Do not favor certain names of the assistants. Be as objective as possible. 
After providing your explanation, output your final verdict by strictly following this format: \"[[A]]\" if assistant A is better, \"[[B]]\" if assistant B is better, and \"[[C]]\" for a tie.\n\n", + "input_format": "[User Question]\n{question}\n\n[The Start of Assistant A's Answer]\n{answer_a}\n[The End of Assistant A's Answer]\n\n[The Start of Assistant B's Answer]\n{answer_b}\n[The End of Assistant B's Answer]", + "output_format": "[[{winner}]]", + "postprocessors": [ + "processors.extract_mt_bench_label_judgment" + ] +} diff --git a/src/unitxt/catalog/templates/response_assessment/pairwise_comparison/mt_bench_single_turn_with_reference.json b/src/unitxt/catalog/templates/response_assessment/pairwise_comparison/mt_bench_single_turn_with_reference.json new file mode 100644 index 000000000..13f424b47 --- /dev/null +++ b/src/unitxt/catalog/templates/response_assessment/pairwise_comparison/mt_bench_single_turn_with_reference.json @@ -0,0 +1,16 @@ +{ + "__type__": "pairwise_choice_template", + "choice_a_field": "answer_a", + "choice_b_field": "answer_b", + "answer_field": "winner", + "choice_a_label": "A", + "choice_b_label": "B", + "choice_tie_label": "C", + "shuffle": false, + "instruction": "Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user question displayed below. Your evaluation should consider correctness and helpfulness. You will be given a reference answer, assistant A's answer, and assistant B's answer. Your job is to evaluate which assistant's answer is better. Begin your evaluation by comparing both assistants' answers with the reference answer. Identify and correct any mistakes. Avoid any position biases and ensure that the order in which the responses were presented does not influence your decision. Do not allow the length of the responses to influence your evaluation. Do not favor certain names of the assistants. Be as objective as possible. 
After providing your explanation, output your final verdict by strictly following this format: \"[[A]]\" if assistant A is better, \"[[B]]\" if assistant B is better, and \"[[C]]\" for a tie.\n\n", + "input_format": "[User Question]\n{question}\n\n[The Start of Reference Answer]\n{reference_answer}\n[The End of Reference Answer]\n\n[The Start of Assistant A's Answer]\n{answer_a}\n[The End of Assistant A's Answer]\n\n[The Start of Assistant B's Answer]\n{answer_b}\n[The End of Assistant B's Answer]", + "output_format": "[[{winner}]]", + "postprocessors": [ + "processors.extract_mt_bench_label_judgment" + ] +} diff --git a/src/unitxt/catalog/templates/response_assessment/pairwise_comparison/mt_bench_single_turn_with_reference_with_shuffle.json b/src/unitxt/catalog/templates/response_assessment/pairwise_comparison/mt_bench_single_turn_with_reference_with_shuffling.json similarity index 100% rename from src/unitxt/catalog/templates/response_assessment/pairwise_comparison/mt_bench_single_turn_with_reference_with_shuffle.json rename to src/unitxt/catalog/templates/response_assessment/pairwise_comparison/mt_bench_single_turn_with_reference_with_shuffling.json diff --git a/src/unitxt/catalog/templates/response_assessment/pairwise_comparison/mt_bench_single_turn_with_shuffle.json b/src/unitxt/catalog/templates/response_assessment/pairwise_comparison/mt_bench_single_turn_with_shuffling.json similarity index 100% rename from src/unitxt/catalog/templates/response_assessment/pairwise_comparison/mt_bench_single_turn_with_shuffle.json rename to src/unitxt/catalog/templates/response_assessment/pairwise_comparison/mt_bench_single_turn_with_shuffling.json diff --git a/src/unitxt/dataclass.py b/src/unitxt/dataclass.py index cf029d5eb..b67b93923 100644 --- a/src/unitxt/dataclass.py +++ b/src/unitxt/dataclass.py @@ -1,6 +1,7 @@ import copy import dataclasses import functools +import inspect import warnings from abc import ABCMeta from inspect import Parameter, Signature @@ -123,6 +124,17 @@ class UnexpectedArgumentError(TypeError): standard_variables = dir(object) +def is_class_method(func): + if inspect.ismethod(func): + return True + if inspect.isfunction(func): + sig = inspect.signature(func) + params = list(sig.parameters.values()) + if len(params) > 0 and params[0].name in ["self", "cls"]: + return True + return False + + def is_possible_field(field_name, field_value): """Check if a name-value pair can potentially represent a field. @@ -133,11 +145,11 @@ def is_possible_field(field_name, field_value): Returns: bool: True if the name-value pair can represent a field, False otherwise. 
""" - return ( - field_name not in standard_variables - and not field_name.startswith("__") - and not callable(field_value) - ) + if field_name in standard_variables: + return False + if is_class_method(field_value): + return False + return True def get_fields(cls, attrs): @@ -180,20 +192,21 @@ def get_fields(cls, attrs): } if field_name in attrs: - field = attrs[field_name] - if isinstance(field, Field): - args = {**dataclasses.asdict(field), **args} - elif isinstance(field, dataclasses.Field): + field_value = attrs[field_name] + if isinstance(field_value, Field): + args = {**dataclasses.asdict(field_value), **args} + elif isinstance(field_value, dataclasses.Field): args = { - "default": field.default, - "name": field.name, - "type": field.type, - "init": field.init, - "default_factory": field.default_factory, + "default": field_value.default, + "name": field_value.name, + "type": field_value.type, + "init": field_value.init, + "default_factory": field_value.default_factory, **args, } else: - args["default"] = field + args["default"] = field_value + args["default_factory"] = None else: args["default"] = dataclasses.MISSING args["default_factory"] = None @@ -413,6 +426,7 @@ def __init__(self, *argv, **kwargs): Checks for abstract fields when an instance is created. Warn when a deprecated is used """ + super().__init__() _init_fields = [field for field in fields(self) if field.init] _init_fields_names = [field.name for field in _init_fields] _init_positional_fields_names = [ diff --git a/src/unitxt/deprecation_utils.py b/src/unitxt/deprecation_utils.py index 300dafea8..2cfce92f4 100644 --- a/src/unitxt/deprecation_utils.py +++ b/src/unitxt/deprecation_utils.py @@ -74,12 +74,13 @@ def wrapper(*args, **kwargs): return wrapper -def deprecation(version, alternative=None): +def deprecation(version, alternative=None, msg=None): """Decorator for marking functions or class methods as deprecated. Args: version (str): The version at which the function or method becomes deprecated. alternative (str, optional): Suggested alternative to the deprecated functionality. + msg (str, optional): Additional message regarding the deprecation reason or alternatives. Returns: callable: A decorator that can be applied to functions or class methods. @@ -87,6 +88,7 @@ def deprecation(version, alternative=None): def decorator(obj): alt_text = f" Use {alternative} instead." if alternative is not None else "" + alt_text += msg if msg is not None else "" if callable(obj): func = obj elif hasattr(obj, "__init__"): diff --git a/src/unitxt/formats.py b/src/unitxt/formats.py index 2b83422d2..fe888f5f4 100644 --- a/src/unitxt/formats.py +++ b/src/unitxt/formats.py @@ -59,10 +59,13 @@ class BaseFormat(Format): demos_field: str = "demos" @staticmethod - def _retrieve_field_and_pop_from_instance(instance, field_name) -> str: + def _retrieve_field_and_pop_from_instance( + instance, field_name, do_pop: bool = True + ) -> str: if field_name is not None and field_name in instance: field_value = instance[field_name] - instance.pop(field_name) + if do_pop: + instance.pop(field_name) assert ( field_value is not None ), f"Value in field '{field_name}' should not be none. 
Received instance: {instance}" @@ -165,10 +168,20 @@ def process( demos_string = "" for demo_instance in demo_instances: + demo_source = self._retrieve_field_and_pop_from_instance( + instance=demo_instance, field_name="source", do_pop=False + ) + demo_target = self._retrieve_field_and_pop_from_instance( + instance=demo_instance, field_name="target", do_pop=False + ) + demo_target_prefix = self._retrieve_field_and_pop_from_instance( + instance=demo_instance, field_name="target_prefix", do_pop=False + ) + demo_str = self.demo_format.format( - target_prefix=target_prefix, - source=demo_instance["source"], - target=demo_instance["target"], + target_prefix=demo_target_prefix, + source=demo_source, + target=demo_target, **self.format_args, ) demos_string += demo_str diff --git a/src/unitxt/llm_as_judge.py b/src/unitxt/llm_as_judge.py index 5a7f11ad4..a332a4859 100644 --- a/src/unitxt/llm_as_judge.py +++ b/src/unitxt/llm_as_judge.py @@ -2,9 +2,12 @@ from .api import evaluate, produce from .artifact import Artifact, fetch_artifact, settings +from .formats import Format from .inference import InferenceEngine, OpenAiInferenceEngine from .metrics import BulkInstanceMetric from .operator import SequentialOperator +from .system_prompts import SystemPrompt +from .templates import Template class LLMAsJudge(BulkInstanceMetric): @@ -14,9 +17,9 @@ class LLMAsJudge(BulkInstanceMetric): main_score (str): The main score label used for evaluation. task (Literal["rating.single_turn"]): The type of task the llm-as-judge runs. This defines the output and input format of the jude model. - template (str): The template used when generating inputs for the judge llm. - format (str): The format used when generating inputs for judge llm. - system_prompt (str): The system prompt used when generating inputs for judge llm. + template (Template): The template used when generating inputs for the judge llm. + format (Format): The format used when generating inputs for judge llm. + system_prompt (SystemPrompt): The system prompt used when generating inputs for judge llm. strip_system_prompt_and_format_from_inputs (bool): Whether to strip the system prompt and formatting from the inputs that the models that is being judges received, when they are inserted to the llm-as-judge prompt. inference_model (InferenceEngine): the module that creates the inference of the judge llm. 
@@ -25,14 +28,19 @@ class LLMAsJudge(BulkInstanceMetric): """ main_score: str = "llm_as_judge" - task: Literal["rating.single_turn", "single_turn_with_reference"] - template: str - format: Optional[str] = None - system_prompt: Optional[str] = None + task: Literal[ + "rating.single_turn", + "rating.single_turn_with_reference", + "pairwise_comparative_rating.single_turn", + ] + template: Template + format: Format = None + system_prompt: SystemPrompt = None strip_system_prompt_and_format_from_inputs: bool = True inference_model: InferenceEngine reduction_map: Optional[Dict[str, List[str]]] = None batch_size: int = 32 + prediction_type = Any # Because handled with multiple tasks def _get_input_instances(self, task_data: List[Dict]) -> List: if self.strip_system_prompt_and_format_from_inputs: @@ -43,7 +51,10 @@ def _get_input_instances(self, task_data: List[Dict]) -> List: instance = SequentialOperator( steps=[template, "formats.empty"] ).process_instance( - {"inputs": task_data_instance, "outputs": task_data_instance} + { + "input_fields": task_data_instance, + "reference_fields": task_data_instance, + } ) instances.append(instance["source"]) """ @@ -79,23 +90,67 @@ def _get_instance_for_judge_model( input_instances, predictions, references ) ] + elif self.task == "pairwise_comparative_rating.single_turn": + instances = [ + { + "question": input_instance, + "answer_a": prediction, + "answer_b": reference[0], + "model_a": "input_model", + "model_b": "baseline_model", + "answer_a_preference": 0, # This is a dummy value that is not used in practice, + } + for input_instance, prediction, reference in zip( + input_instances, predictions, references + ) + ] else: raise NotImplementedError( f"Error in 'LLMAsJudge' metric. {self.task} is not a supported task type." ) return instances + @staticmethod + def _add_metadata_to_judge_instances( + instances: List[List[Any]], task_data: List[Dict] + ): + for instance, data in zip(instances, task_data): + instance["data_classification_policy"] = data["metadata"][ + "data_classification_policy" + ] + def prepare(self): super().prepare() + if self.task == "pairwise_comparative_rating.single_turn": + self.reduction_map = {"weighted_win_rate": [self.main_score]} if self.reduction_map is None: self.reduction_map = {"mean": [self.main_score]} - supported_tasks = ["rating.single_turn", "rating.single_turn_with_reference"] + def verify(self): + supported_tasks = [ + "rating.single_turn", + "rating.single_turn_with_reference", + "pairwise_comparative_rating.single_turn", + ] assert self.task in supported_tasks, ( f"Error in 'LLMAsJudge' metric. {self.task} is not a supported task type." f"The supported tasks types are: {', '.join(supported_tasks)}." 
) + if not isinstance(self.template, Template): + raise ValueError( + f"Provided template argument to 'LLMAsJudge' metric is not of type Template, but {type(self.template)}" + ) + if self.format and not isinstance(self.format, Format): + raise ValueError( + f"Provided format argument to 'LLMAsJudge' metric is not of type Format, but {type(self.format)}" + ) + + if self.system_prompt and not isinstance(self.system_prompt, SystemPrompt): + raise ValueError( + f"Provided system_prompt argument to 'LLMAsJudge' metric is not of type SystemPrompt, but {type(self.system_prompt)}" + ) + if isinstance(self.inference_model, OpenAiInferenceEngine): if self.format: raise ValueError( @@ -121,6 +176,7 @@ def compute( instances = self._get_instance_for_judge_model( input_instances, predictions, references ) + self._add_metadata_to_judge_instances(instances, task_data) card = f"cards.dynamic_cards_for_llm_judges.{self.task}" recipe_args = { @@ -138,10 +194,29 @@ def compute( dataset = produce(instances, recipe) verdicts = self.inference_model.infer(dataset) meta_scores = evaluate(predictions=verdicts, data=dataset) - return [ - { - self.main_score: instance["processed_prediction"], - "judge_raw_output": verdict, - } - for instance, verdict in zip(meta_scores, verdicts) - ] + + res_list = [] + for instance, verdict in zip(meta_scores, verdicts): + if self.task == "pairwise_comparative_rating.single_turn": + is_model_b_the_baseline = ( + instance["task_data"]["model_b"] == "baseline_model" + ) + if is_model_b_the_baseline: + model_a_preference_score = instance["processed_prediction"] + else: + model_a_preference_score = instance["processed_prediction"] * -1 + + res = { + self.main_score: model_a_preference_score, + "judge_raw_output": verdict, + "judge_raw_input": instance["source"], + } + else: + res = { + self.main_score: instance["processed_prediction"], + "judge_raw_output": verdict, + "judge_raw_input": instance["source"], + } + res_list.append(res) + + return res_list diff --git a/src/unitxt/loaders.py b/src/unitxt/loaders.py index 9a0503a58..866c24091 100644 --- a/src/unitxt/loaders.py +++ b/src/unitxt/loaders.py @@ -858,7 +858,9 @@ def _get_file_list_from_wildcard_path( def _map_wildcard_path_to_full_paths(self): api = HfApi() - repo_files = api.list_repo_files(self.space_name, repo_type="space") + repo_files = api.list_repo_files( + self.space_name, repo_type="space", revision=self.revision + ) if isinstance(self.data_files, str): self.data_files = self._get_file_list_from_wildcard_path( self.data_files, repo_files diff --git a/src/unitxt/metrics.py b/src/unitxt/metrics.py index 73fadeb2a..f283fbf1c 100644 --- a/src/unitxt/metrics.py +++ b/src/unitxt/metrics.py @@ -10,7 +10,7 @@ from dataclasses import field from operator import itemgetter from statistics import mean -from typing import Any, Dict, Generator, List, Optional, Tuple +from typing import Any, Dict, Generator, List, Optional, Tuple, Union import evaluate import numpy @@ -19,13 +19,14 @@ from scipy.stats import bootstrap from scipy.stats._warnings_errors import DegenerateDataWarning -from .artifact import Artifact +from .artifact import Artifact, fetch_artifact from .dataclass import ( AbstractField, InternalField, NonPositionalField, OptionalField, ) +from .deprecation_utils import deprecation from .inference import HFPipelineBasedInferenceEngine, InferenceEngine from .logging_utils import get_logger from .metric_utils import InstanceInput, MetricRequest, MetricResponse @@ -40,14 +41,13 @@ from .random_utils import get_seed from 
.settings_utils import get_settings from .stream import MultiStream, Stream -from .type_utils import isoftype, parse_type_string +from .type_utils import Type, isoftype, parse_type_string, to_type_string logger = get_logger() settings = get_settings() warnings.filterwarnings("ignore", category=DegenerateDataWarning) - warnings.filterwarnings("ignore", category=DegenerateDataWarning) @@ -89,28 +89,51 @@ def process( return instance +@deprecation( + version="2.0.0", + msg="use regular type instead of strings (e.g Dict[str] instead of 'Dict[str]')", +) +def parse_string_types_instead_of_actual_objects(obj): + return parse_type_string(obj) + + class Metric(Artifact): main_score: str = AbstractField() # Override 'prediction_type' with the expected type of predictions # and references. Example: "List[str]", "List[Dict]"", "string". # If left with default None, a warning will be displayed. # In future versions of unitxt, this will be an error. - prediction_type: str = None + prediction_type: Union[Type, str] = Any # Standard metrics can receive multiple references per predictions (in a list) # Some metrics support only a single reference per prediction (one element in the list) single_reference_per_prediction: bool = False - # Used to store the parsed prediction type and avoid - # parsing on every use - _parsed_prediction_type = None - # # Used to add a prefix to all score, except the "score_name" and "score" fields. # This is used to distinguish two scores of the same metrics, operating on different fields of the task # score_prefix: str = "" + def prepare(self): + super().prepare() + if isinstance(self.prediction_type, str): + self.prediction_type = parse_string_types_instead_of_actual_objects( + self.prediction_type + ) + + @classmethod + def process_data_after_load(cls, data): + if "prediction_type" in data: + data["prediction_type"] = parse_type_string(data["prediction_type"]) + return data + + def process_data_before_dump(self, data): + if "prediction_type" in data: + if not isinstance(data["prediction_type"], str): + data["prediction_type"] = to_type_string(data["prediction_type"]) + return data + def _add_score_prefix(self, score_name): return ( self.score_prefix + score_name @@ -151,9 +174,9 @@ def _validate_references_and_prediction(self, references, predictions): self._validate_prediction(prediction) def _validate_prediction(self, prediction): - if not isoftype(prediction, self.get_prediction_type()): + if not isoftype(prediction, self.prediction_type): raise ValueError( - f"Each prediction is expected to be of type '{self.prediction_type}' in {self.get_metric_name()} metric. Received prediction of type {type(prediction)}: {prediction}" + f"Each prediction is expected to be of type '{to_type_string(self.prediction_type)}' in {self.get_metric_name()} metric. Received prediction of type {type(prediction)}: {prediction}" ) def _validate_reference(self, reference): @@ -166,28 +189,11 @@ def _validate_reference(self, reference): f"Expecting a list with a single reference per prediction in {self.get_metric_name()} metric. Received a list with multiple references: {reference}" ) for ref in reference: - if not isoftype(ref, self.get_prediction_type()): + if not isoftype(ref, self.prediction_type): raise ValueError( - f"Each reference is expected to be of type '{self.prediction_type}' in {self.get_metric_name()} metric. Received reference of type {type(ref)}: {ref}" + f"Each reference is expected to be of type '{to_type_string(self.prediction_type)}' in {self.get_metric_name()} metric. 
Received reference of type {type(ref)}: {ref}" ) - def get_prediction_type(self): - if self.prediction_type is None: - logger.warning( - f"{self.get_metric_name()} metric does not set the 'prediction_type' parameter so input type checking is not performed. Set the prediction type to the expected prediction type (e.g. 'str', 'List[str]', or 'Any'). In future version of unitxt this will raise an exception." - ) - self._parsed_prediction_type = Any - try: - if self._parsed_prediction_type is not None: - return self._parsed_prediction_type - - self._parsed_prediction_type = parse_type_string(self.prediction_type) - except ValueError: - raise ValueError( - f"Could convert prediction type '{self.prediction_type}' in {self.get_metric_name()} to known type. To enable type checking for this prediction type, open unitxt issue with this message. Alternatively, set the metric's prediction_type to 'Any'" - ) from None - return self._parsed_prediction_type - def get_metric_name(self): if self.__id__ is not None: return self.__id__ @@ -232,6 +238,38 @@ def set_global_score(instances, global_score: Dict[str, Any]): def disable_confidence_interval_calculation(self): pass + # Update instance["score"]["global"] with the newly computed global score, global_score, for the + # current metric. global_score contains "score" and "score_name" fields that reflect + # (the main_score of) the current metric. + # A simple python-dictionary-update adds new fields to instance["score"]["global"], and also replaces the values + # of its fields "score" and "score_name", to reflect the current metric, overwriting previous metrics' settings + # of these fields (if any previous metric exists). + # When global_score does NOT contain CI scores (because CI was not computed for the current metric), but + # one of the previously computed metrics did, then the last such metric set the values in + # fields "score_ci_low" and "score_ci_high" in instance["score"]["global"] to reflect its + # (the previous metric's) CI scores. + # Because CI is not computed for the current metric, global_score does not contain fields "score_ci_low" and + # "score_ci_high" to overwrite the ones existing in instance["score"]["global"], and these might remain in + # instance["score"]["global"], but their values, which are not associated with the current metric, are + # therefore not consistent with "score_name". + # In such a case, following the python-dictionary-update, we pop the fields "score_ci_low" and + # "score_ci_high" out of instance["score"]["global"], so that all the "score.." fields in + # instance["score"]["global"] are consistent with the current metric: the current metric + # is named instance["score"]["global"]["score_name"], its score shows in + # field instance["score"]["global"]["score"], and it does not have ci_scores, + # which is also reflected in the absence of fields "score_ci_low" and "score_ci_high" from instance["score"]["global"]. + # If CI IS computed for the current metric, global_score contains "score_ci_low" and "score_ci_high", and these overwrite + # the ones existing in instance["score"]["global"] by a simple python-dictionary-update, and no further fixup is needed.
+ def update_and_adjust_global_score( + self, instance: Dict[str, Any], global_score: dict + ): + instance["score"]["global"].update(global_score) + for score_ci in ["score_ci_low", "score_ci_high"]: + if score_ci in global_score: + continue + if score_ci in instance["score"]["global"]: + instance["score"]["global"].pop(score_ci) + class MetricWithConfidenceInterval(Metric): # The number of resamples used to estimate the confidence intervals of this metric. @@ -327,6 +365,7 @@ def score_based_confidence_interval( # otherwise, the aggregation_func needs to be applied AFTER resampling the instances; # that is, re-form the groups, calculate the function, and take the mean of the group scores aggregation_func = self.average_item_scores + for score_name in score_names: # If all computed instance level scores are the same, there is no point in computing # confidence intervals. So skip to the next score. @@ -525,7 +564,6 @@ def process(self, stream: Stream, stream_name: Optional[str] = None) -> Generato self._validate_references_and_prediction(references, predictions) result = self._compute(references, predictions, task_data) - global_score.update(self._add_score_prefixes_to_score_dict(result)) score_name = global_score["score_name"] confidence_interval = self.compute_global_confidence_intervals( @@ -534,7 +572,7 @@ def process(self, stream: Stream, stream_name: Optional[str] = None) -> Generato global_score.update(confidence_interval) for instance in instances: - instance["score"]["global"].update(global_score) + self.update_and_adjust_global_score(instance, global_score) yield instance def _compute( @@ -576,7 +614,9 @@ class BulkInstanceMetric(StreamOperator, MetricWithConfidenceInterval): reduction_map: Dict[str, List[str]] - implemented_reductions: List[str] = field(default_factory=lambda: ["mean"]) + implemented_reductions: List[str] = field( + default_factory=lambda: ["mean", "weighted_win_rate"] + ) def process(self, stream: Stream, stream_name: Optional[str] = None) -> Generator: global_score = {} @@ -651,9 +691,29 @@ def process(self, stream: Stream, stream_name: Optional[str] = None) -> Generato instances=instances, score_names=ci_fields_with_prefix ) global_score.update(confidence_interval) + if reduction == "weighted_win_rate": + for field_name in fields: + field_name_with_prefix = self._add_score_prefix(field_name) + total_battles = 0 + wins = 0 + for instance in instances: + s = instance["score"]["instance"][field_name_with_prefix] + if s > 0: + total_battles += s + wins += s + elif s < 0: + total_battles += abs(s) + else: + total_battles += 2 + wins += 1 + + global_score[field_name_with_prefix] = wins / total_battles + if field_name == self.main_score: + global_score["score"] = global_score[field_name_with_prefix] + global_score["score_name"] = self.score_prefix + self.main_score for instance in instances: - instance["score"]["global"].update(global_score) + self.update_and_adjust_global_score(instance, global_score) yield instance @abstractmethod @@ -666,6 +726,179 @@ def compute( pass +class WeightedWinRateCorrelation(GlobalMetric): + main_score = "spearman_corr" + average = None # Report per class then aggregate by mean + metric = "weighted_win_rate_correlation" + + @staticmethod + def _update_battles_dataframe( + df: pd.DataFrame, + model_a: str, + model_b: str, + model_a_wins: int, + model_b_wins: int, + ): + import pandas as pd + + # Sort the model tuple alphabetically + if model_b < model_a: + temp = model_a + model_a = model_b + model_b = temp + temp = model_a_wins + 
model_a_wins = model_b_wins + model_b_wins = temp + + # Check if a row with these models already exists + row = df[(df["model_a"] == model_a) & (df["model_b"] == model_b)] + + if not row.empty: + # Update the existing row + index = row.index[0] + df.at[index, "model_a_win_count"] += model_a_wins + df.at[index, "model_b_win_count"] += model_b_wins + df.at[index, "total_battles"] += model_a_wins + model_b_wins + else: + # Add a new row + new_row = { + "model_a": model_a, + "model_b": model_b, + "model_a_win_count": model_a_wins, + "model_b_win_count": model_b_wins, + "total_battles": model_a_wins + model_b_wins, + } + df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True) + + return df + + @staticmethod + def _get_win_rate_df(df: pd.DataFrame): + # Step 1: Aggregate wins for each model + # Create separate DataFrames for wins and battles + df_wins_a = df[["model_a", "model_a_win_count"]].rename( + columns={"model_a": "model", "model_a_win_count": "wins"} + ) + df_wins_b = df[["model_b", "model_b_win_count"]].rename( + columns={"model_b": "model", "model_b_win_count": "wins"} + ) + df_wins = pd.concat([df_wins_a, df_wins_b]) + + # Aggregate total wins for each model + total_wins = df_wins.groupby("model").sum().reset_index() + + # Step 2: Calculate total battles for each model + # Count appearances in model_a and model_b + battles_a = df[["model_a", "total_battles"]].rename( + columns={"model_a": "model"} + ) + battles_b = df[["model_b", "total_battles"]].rename( + columns={"model_b": "model"} + ) + battles = pd.concat([battles_a, battles_b]) + + # Aggregate total battles for each model + total_battles = battles.groupby("model").sum().reset_index() + + # Step 3: Merge and compute win rate + win_rates = total_wins.merge(total_battles, on="model") + win_rates["win_rate"] = win_rates["wins"] / win_rates["total_battles"] + return win_rates + + def compute( + self, + references: List[List[Any]], + predictions: List[Any], + task_data: List[Any], + ) -> dict: + import pandas as pd + + """Computes a scores dictionary on a list of references, predictions and input. + + This function is called once per instance, and then another time + over all data instances. 
+ + Returns: + a dictionary of scores that is set as: + the instance scores when called on a single data instance + the global score when called on the all data instances + """ + if len(predictions) == 1: + prediction = predictions[0] + gold_ref = references[0][0] + return {"loss": abs(prediction - gold_ref)} + + pred_df = pd.DataFrame( + columns=[ + "model_a", + "model_b", + "model_a_win_count", + "model_b_win_count", + "total_battles", + ] + ) + ref_df = pd.DataFrame( + columns=[ + "model_a", + "model_b", + "model_a_win_count", + "model_b_win_count", + "total_battles", + ] + ) + + for instance_task_data, prediction, gold_ref in zip( + task_data, predictions, references + ): + gold_ref = int(gold_ref[0]) + model_a = instance_task_data["model_a"] + model_b = instance_task_data["model_b"] + if prediction > 0: + model_a_wins = prediction + model_b_wins = 0 + elif prediction < 0: + model_a_wins = 0 + model_b_wins = -1 * prediction + else: + model_a_wins = 1 + model_b_wins = 1 + + pred_df = self._update_battles_dataframe( + pred_df, model_a, model_b, model_a_wins, model_b_wins + ) + + if gold_ref > 0: + model_a_wins = gold_ref + model_b_wins = 0 + elif gold_ref < 0: + model_a_wins = 0 + model_b_wins = -1 * gold_ref + else: + model_a_wins = 1 + model_b_wins = 1 + + ref_df = self._update_battles_dataframe( + ref_df, model_a, model_b, model_a_wins, model_b_wins + ) + + pred_df_win_rate = self._get_win_rate_df(pred_df) + ref_df_win_rate = self._get_win_rate_df(ref_df) + + from scipy.stats import pearsonr, spearmanr + + merged_df = pd.merge( + pred_df_win_rate, ref_df_win_rate, on="model", suffixes=("_pred", "_ref") + ) + pearson_corr, _ = pearsonr( + merged_df["win_rate_pred"], merged_df["win_rate_ref"] + ) + spearman_corr, _ = spearmanr( + merged_df["win_rate_pred"], merged_df["win_rate_ref"] + ) + + return {"pearson_corr": pearson_corr, "spearman_corr": spearman_corr} + + class InstanceMetric(StreamOperator, MetricWithConfidenceInterval): """Class for metrics for which a global score can be calculated by aggregating the instance scores (possibly with additional instance inputs). 
@@ -870,7 +1103,7 @@ def process(self, stream: Stream, stream_name: Optional[str] = None) -> Generato global_score.update(confidence_interval) for instance in instances: - instance["score"]["global"].update(global_score) + self.update_and_adjust_global_score(instance, global_score) yield from instances def compute_instance_scores( @@ -1018,7 +1251,7 @@ class Accuracy(InstanceMetric): main_score = "accuracy" ci_scores = ["accuracy"] - prediction_type = "Any" # string representation is compared + prediction_type = Any # string representation is compared def compute( self, references: List[Any], prediction: Any, task_data: List[Dict] @@ -1038,7 +1271,7 @@ class JaccardIndex(InstanceMetric): main_score = "jaccard_index" ci_scores = ["jaccard_index"] - prediction_type = "Any" # string representation is compared + prediction_type = Any # string representation is compared def compute( self, references: List[Any], prediction: Any, task_data: List[Dict] @@ -1092,7 +1325,7 @@ class StringContainment(InstanceMetric): main_score = "string_containment" ci_scores = ["string_containment"] - prediction_type = "Any" # string representation is compared + prediction_type = Any # string representation is compared single_reference_per_prediction = False # multiple references allowed def compute( @@ -1120,6 +1353,7 @@ def disable_confidence_interval_calculation(self): self.metric.disable_confidence_interval_calculation() def verify(self): + super().verify() assert ( self.metric is not None ), f"'metric' is not set in {self.get_metric_name()}" @@ -1300,13 +1534,89 @@ def compute( return results +class HuggingfaceInstanceMetric(InstanceMetric): + hf_metric_name: str + + hf_metric_fields: List[str] + hf_compute_args: dict = {} + + def prepare(self): + super().prepare() + self.metric = evaluate.load( + self.hf_metric_name, experiment_id=str(uuid.uuid4()) + ) + + def compute(self, references: List[Any], prediction: Any, task_data: Dict) -> dict: + # invokes module.compute, which invokes, e.g., meteor's _compute + + try: + score = self.metric.compute( + predictions=[prediction], + references=[references], + **self.hf_compute_args, + ) + except: + score = {self.main_score: np.nan} + + if self.hf_metric_fields is not None and len(self.hf_metric_fields) > 0: + to_ret = {field: score[field] for field in self.hf_metric_fields} + score = to_ret + + return score + + +class Meteor(InstanceMetric): + main_score = "meteor" + ci_scores = ["meteor"] + reduction_map = {"mean": ["meteor"]} + prediction_type = str + + _requirements_list: List[str] = ["nltk"] + alpha: float = 0.9 + beta: int = 3 + gamma: float = 0.5 + # unitxt uses nltk version >= 3.8 + + def prepare(self): + super().prepare() + import nltk + + nltk.download("wordnet", quiet=True) + nltk.download("omw-1.4", quiet=True) + from nltk import word_tokenize + from nltk.translate import meteor_score + + self.word_tokenize = word_tokenize + self.meteor_score = meteor_score + + def verify(self): + import importlib.metadata as importlib_metadata + + from datasets.config import version + + nltk_version = version.parse(importlib_metadata.version("nltk")) + assert nltk_version >= version.Version( + "3.6.6" + ), "nltk version must be at least 3.6.6" + + def compute(self, references, prediction, task_data): + score = self.meteor_score.meteor_score( + [self.word_tokenize(ref) for ref in references], + self.word_tokenize(prediction), + alpha=self.alpha, + beta=self.beta, + gamma=self.gamma, + ) + return {"meteor": score} + + class F1(GlobalMetric): _metric = None main_score = 
"f1_macro" average = None # Report per class then aggregate by mean metric = "f1" - prediction_type = "str" + prediction_type = str single_reference_per_prediction = True def prepare(self): @@ -1366,7 +1676,7 @@ class F1Binary(GlobalMetric): main_score = "f1_binary" average = None threshold = 0.5 - prediction_type = "Union[float, int]" + prediction_type = Union[float, int] _metric = None metric = "f1" single_reference_per_prediction = True @@ -1425,12 +1735,12 @@ class FinQAEval(InstanceMetric): reduction_map = {"mean": ["program_accuracy", "execution_accuracy"]} main_score = "program_accuracy" ci_scores = ["program_accuracy", "execution_accuracy"] - prediction_type = "str" + prediction_type = str finqa_module = "" def finqa_eval_program( self, references: List[List], prediction: str, task_data: Dict, finqa_module - ) -> (float, float): + ) -> Tuple[float, float]: prog_correct = False pred_item = finqa_module.program_tokenization(prediction) program = task_data["program_re"] @@ -1442,7 +1752,7 @@ def finqa_eval_program( def finqa_eval_execution( self, references: List[List], prediction: str, task_data: Dict, finqa_module - ) -> (float, float): + ) -> Tuple[float, float]: exe_correct = False last_char = prediction.rfind(")") prediction = prediction[: last_char + 1] @@ -1582,7 +1892,7 @@ class F1MultiLabel(GlobalMetric): average = None # Report per class then aggregate by mean metric = "f1" - prediction_type = "List[str]" + prediction_type = List[str] single_reference_per_prediction = True def prepare(self): @@ -1691,16 +2001,61 @@ class F1MacroMultiLabel(F1MultiLabel): average = None -class Rouge(HuggingfaceMetric): +class Rouge(InstanceMetric): + main_score = "rougeL" + prediction_type = str + single_reference_per_prediction = False # multiple references allowed + rouge_types: List[str] = ["rouge1", "rouge2", "rougeL", "rougeLsum"] + reduction_map = {"mean": ["rouge1", "rouge2", "rougeL", "rougeLsum"]} + ci_scores = ["rouge1", "rouge2", "rougeL", "rougeLsum"] + + sent_split_newline: bool = True + _requirements_list: List[str] = ["nltk", "rouge_score"] + + def prepare(self): + super().prepare() + import nltk + from rouge_score import rouge_scorer + + self.rouge_scorer = rouge_scorer + + nltk.download("punkt", quiet=True) + self.sent_tokenize = nltk.sent_tokenize + + def compute(self, references: List[Any], prediction: Any, task_data: Dict) -> dict: + # for a single instance, prediction is of type str, and references: list of str + if self.sent_split_newline: + prediction = "\n".join(self.sent_tokenize(prediction.strip())) + + references = [ + "\n".join(self.sent_tokenize(reference.strip())) + for reference in references + ] + + # the following is taken from HF rouge, using the defaults: + # use_aggregator=True, use_stemmer=False, tokenizer=None + scorer = self.rouge_scorer.RougeScorer( + rouge_types=self.rouge_types, use_stemmer=False, tokenizer=None + ) + # with Unitxt, references is a list + score = scorer.score_multi(references, prediction) + for key in score: + score[key] = score[key].fmeasure + return score + + +class RougeHF(HuggingfaceInstanceMetric): hf_metric_name = "rouge" main_score = "rougeL" scale = 1.0 - prediction_type = "str" + prediction_type = str single_reference_per_prediction = False # multiple references allowed - use_aggregator: bool = True rouge_types: List[str] = ["rouge1", "rouge2", "rougeL", "rougeLsum"] + reduction_map = {"mean": ["rouge1", "rouge2", "rougeL", "rougeLsum"]} + hf_metric_fields = ["rouge1", "rouge2", "rougeL", "rougeLsum"] + ci_scores = ["rouge1", 
"rouge2", "rougeL", "rougeLsum"] sent_split_newline: bool = True @@ -1709,26 +2064,33 @@ class Rouge(HuggingfaceMetric): def prepare(self): super().prepare() + # We don't use the aggregation, to avoid running bootstrapping by the + # internal library (which is costly) and done by Unitxt in any case. self.hf_compute_args.update( - {"use_aggregator": self.use_aggregator, "rouge_types": self.rouge_types} + {"use_aggregator": False, "rouge_types": self.rouge_types} ) import nltk - nltk.download("punkt") + nltk.download("punkt", quiet=True) self.sent_tokenize = nltk.sent_tokenize - def compute(self, references, predictions, task_data: List[Dict]): + def compute(self, references, prediction, task_data: List[Dict]): + # for a single instance, prediction is of type str, and references: list of str if self.sent_split_newline: - predictions = [ - "\n".join(self.sent_tokenize(prediction.strip())) - for prediction in predictions - ] + prediction = "\n".join(self.sent_tokenize(prediction.strip())) + references = [ - ["\n".join(self.sent_tokenize(r.strip())) for r in reference] + "\n".join(self.sent_tokenize(reference.strip())) for reference in references ] - return super().compute(references, predictions, task_data) + + hf_score = super().compute(references, prediction, task_data) + for metric_field in self.hf_metric_fields: + if isinstance(hf_score[metric_field], list): + assert len(hf_score[metric_field]) == 1 + hf_score[metric_field] = hf_score[metric_field][0] + return hf_score # Computes char edit distance, ignoring whitespace @@ -1736,7 +2098,7 @@ class CharEditDistance(InstanceMetric): main_score = "char_edit_distance" reduction_map = {"mean": [main_score]} ci_scores = [main_score] - prediction_type = "str" + prediction_type = str single_reference_per_prediction = True accuracy_metric = False @@ -1774,7 +2136,7 @@ class CharEditDistanceAccuracy(CharEditDistance): class Wer(HuggingfaceMetric): hf_metric_name = "wer" main_score = "wer" - prediction_type = "str" + prediction_type = str single_reference_per_prediction = True _requirements_list: List[str] = ["jiwer"] @@ -1796,13 +2158,13 @@ class Spearmanr(HuggingfaceMetric): hf_metric_name = "spearmanr" main_score = "spearmanr" process_single_instances = False - prediction_type = "float" + prediction_type = float # Spearmanr references are not list def _validate_reference(self, reference): - if not isoftype(reference, self.get_prediction_type()): + if not isoftype(reference, self.prediction_type): raise ValueError( - f"Each reference is expected to be of type '{self.prediction_type}' in {self.get_metric_name()} metric. Received prediction of type {type(reference)}: {reference}" + f"Each reference is expected to be of type '{to_type_string(self.prediction_type)}' in {self.get_metric_name()} metric. 
Received prediction of type {type(reference)}: {reference}" ) @@ -1810,7 +2172,7 @@ class KendallTauMetric(GlobalMetric): main_score = "kendalltau_b" variant = "b" process_single_instances = False - prediction_type = "float" + prediction_type = float _requirements_list: List[str] = ["scipy"] @@ -1842,7 +2204,7 @@ class MatthewsCorrelation(HuggingfaceMetric): str_to_id: dict = InternalField(default_factory=dict) single_reference_per_prediction = True - prediction_type = "str" + prediction_type = str def get_str_id(self, str): if str not in self.str_to_id: @@ -1872,7 +2234,7 @@ class RocAuc(GlobalMetric): process_single_instances = False _requirements_list: List[str] = ["sklearn"] single_reference_per_prediction = True - prediction_type = "float" + prediction_type = float def prepare(self): from sklearn import metrics @@ -1898,7 +2260,7 @@ def compute( class CustomF1(GlobalMetric): main_score = "f1_micro" - prediction_type = "Any" + prediction_type = Any single_reference_per_prediction = True groups = None zero_division: float = 0.0 @@ -2077,7 +2439,7 @@ def add_macro_scores(self, f1_result, recall_result, precision_result, result): class NER(CustomF1): - prediction_type = "List[Tuple[str,str]]" + prediction_type = List[Tuple[str, str]] def get_element_group(self, element, additional_input): return element[1] @@ -2110,7 +2472,7 @@ class TokenOverlap(InstanceMetric): main_score = "f1" ci_scores = ["f1", "precision", "recall"] single_reference_per_prediction = False - prediction_type = "str" + prediction_type = str def compute( self, references: List[Any], prediction: Any, task_data: List[Dict] @@ -2149,7 +2511,7 @@ class BertScore(HuggingfaceBulkMetric): model_name: str model_layer: int = None - prediction_type = "str" + prediction_type = str _requirements_list: List[str] = ["bert_score"] @@ -2218,7 +2580,7 @@ class Reward(BulkInstanceMetric): model_name: str - prediction_type = "str" + prediction_type = str single_reference_per_prediction = True _requirements_list: List[str] = ["transformers", "torch"] @@ -2257,7 +2619,7 @@ class Detector(BulkInstanceMetric): main_score = "score" batch_size: int = 32 - prediction_type = "str" + prediction_type = str model_name: str @@ -2291,7 +2653,7 @@ class RegardMetric(GlobalMetric): # Regard passes task data in the legacy way using references # instead of using the 'task_data' parameters, so prediction # type and reference type are different - prediction_type = "Any" + prediction_type = Any _requirements_list: List[str] = ["transformers", "torch", "tqdm"] @@ -2404,7 +2766,7 @@ class SafetyMetric(GlobalMetric): # Safety passes task data in the legacy way using references # instead of using the 'task_data' parameters, so prediction # type and reference type are different - prediction_type = "Any" + prediction_type = Any batch_size: int = 100 critical_threshold: int = -5 # _CRITICAL_THRESHOLD = -5 high_threshold: int = -4 # _HIGH_THRESHOLD = -4 @@ -2503,7 +2865,7 @@ def compute(self, references, predictions, task_data): class LlamaIndexLLMMetric(InstanceMetric): model_name: str = "" main_score: str = "" - prediction_type: str = "str" + prediction_type: str = str reduction_map: Dict[str, List[str]] = None openai_models: List[str] = ["gpt-3.5-turbo"] anthropic_models: List[ @@ -2650,7 +3012,7 @@ class Perplexity(BulkInstanceMetric): main_score = "perplexity" reduction_map = {"mean": ["perplexity"]} - prediction_type = "str" + prediction_type = str source_template: str target_template: str @@ -2924,14 +3286,14 @@ class Squad(HuggingfaceMetric): main_score = 
"f1" scale = 100.0 scaled_fields = ["f1", "exact_match"] - prediction_type = "Dict[str,Any]" + prediction_type = Dict[str, Any] # Squad references are not list, but a dict that contain a field called 'answers/text' # which is the list of references def _validate_reference(self, reference): - if not isoftype(reference, self.get_prediction_type()): + if not isoftype(reference, self.prediction_type): raise ValueError( - f"Each reference is expected to be of type '{self.prediction_type}' in {self.get_metric_name()} metric. Received prediction of type {type(reference)}: {reference}" + f"Each reference is expected to be of type '{to_type_string(self.prediction_type)}' in {self.get_metric_name()} metric. Received prediction of type {type(reference)}: {reference}" ) @@ -2954,7 +3316,7 @@ class NDCG(GlobalMetric): _requirements_list: List[str] = ["sklearn"] single_reference_per_prediction = True - prediction_type = "Optional[float]" + prediction_type = Optional[float] def prepare(self): from sklearn.metrics import ndcg_score @@ -3002,7 +3364,7 @@ def compute( class RetrievalMetric(InstanceMetric): - prediction_type = "List[str]" + prediction_type = List[str] single_reference_per_prediction = True def compute(self, references: List[Any], prediction: Any, task_data: Dict) -> dict: @@ -3156,7 +3518,7 @@ def _compute( class KPA(CustomF1): - prediction_type = "str" + prediction_type = str single_reference_per_prediction = True def get_element_group(self, element, additional_input): @@ -3895,7 +4257,7 @@ class BinaryAccuracy(InstanceMetric): ci_scores = ["accuracy_binary"] threshold = 0.5 - prediction_type = "Union[float,int]" + prediction_type = Union[float, int] single_reference_per_prediction = True def _validate_reference(self, reference): @@ -3922,7 +4284,7 @@ class BinaryMaxAccuracy(GlobalMetric): process_single_instances = False main_score = "max_accuracy_binary" - prediction_type = "Union[float,int]" + prediction_type = Union[float, int] single_reference_per_prediction = True def compute( @@ -4091,7 +4453,7 @@ def compute( class NormalizedSacrebleu(HuggingfaceMetric): hf_metric_name = "sacrebleu" hf_main_score = "score" - prediction_type = "str" + prediction_type = str main_score = "sacrebleu" scale = 100.0 scaled_fields = ["sacrebleu", "precisions"] @@ -4129,7 +4491,7 @@ def calculate_groups_ratio(self, actual_group, total_group): class FuzzyNer(CustomF1Fuzzy): - prediction_type = "List[Tuple[str,str]]" + prediction_type = List[Tuple[str, str]] fuzz_ratio = 75 def get_element_group(self, element, additional_input): @@ -4157,7 +4519,7 @@ class IsCodeMixed(BulkInstanceMetric): main_score = "is_code_mixed" reduction_map = {"mean": [main_score]} - prediction_type = "str" + prediction_type = str inference_model: InferenceEngine = None @@ -4201,3 +4563,61 @@ def _prepare_instances_for_model(self, texts: List[str]): ) processed_stream = self.processor.process(stream) return processed_stream.to_dataset()["test"] + + +class MetricsEnsemble(InstanceMetric): + """Metrics Ensemble class for creating ensemble of given metrics. + + Attributes: + main_score (str): The main score label used for evaluation. + metrics (List[Union[Metric, str]]): List of metrics that will be ensemble. + weights (List[float]): Weight of each the metrics + InstanceMetric currently allows two reductions: + reduction_map (Dict[str, List[str]]. Parameter for specifying the redaction method of the global score. + (see it definition at InstanceMetric class). This class define its default + value to reduce by the mean of the main score. 
+ + """ + + main_score = "ensemble_score" + reduction_map = {"mean": [main_score]} + metrics: List[Union[Metric, str]] + weights: List[float] = None + + def get_prefix_name(self, i): + return f"ensemble_{i}_" + + def prepare(self): + super().prepare() + self.metrics = [fetch_artifact(metric)[0] for metric in self.metrics] + for i, metric in enumerate(self.metrics): + metric.score_prefix = self.get_prefix_name(i) + if self.weights is None: + self.weights = [1 / len(self.metrics) for _ in range(len(self.metrics))] + + def create_ensemble_scores(self, instance): + score = self.ensemble(instance) + instance[ + "prediction" + ] = score # We use here the prediction field to pass the score to the compute method. + return instance + + def ensemble(self, instance): + score = 0 + for i, (metric, weight) in enumerate(zip(self.metrics, self.weights)): + score += ( + instance["score"]["instance"][ + self.get_prefix_name(i) + metric.main_score + ] + * weight + ) + return score + + def process(self, stream: Stream, stream_name: Optional[str] = None) -> Generator: + for metric in self.metrics: + stream = list(metric.process(stream=stream, stream_name=stream_name)) + stream = [self.create_ensemble_scores(g) for g in stream] + return super().process(stream=stream, stream_name=stream_name) + + def compute(self, references: List[Any], prediction: Any, task_data: Dict) -> dict: + return {self.main_score: prediction} diff --git a/src/unitxt/operators.py b/src/unitxt/operators.py index 7f996091f..034f47a10 100644 --- a/src/unitxt/operators.py +++ b/src/unitxt/operators.py @@ -303,6 +303,10 @@ class SelectFields(InstanceOperator): fields: List[str] + def prepare(self): + super().prepare() + self.fields.extend(["data_classification_policy", "recipe_metadata"]) + def process( self, instance: Dict[str, Any], stream_name: Optional[str] = None ) -> Dict[str, Any]: diff --git a/src/unitxt/parsing_utils.py b/src/unitxt/parsing_utils.py index 0496ba8d7..d9bcc028b 100644 --- a/src/unitxt/parsing_utils.py +++ b/src/unitxt/parsing_utils.py @@ -55,6 +55,8 @@ def consume_name_val(instring: str) -> Tuple[Any, str]: return (True, instring) if name_val == "False": return (False, instring) + if name_val == "None": + return (None, instring) sign = 1 if name_val.startswith("-"): @@ -135,7 +137,7 @@ def consume_assignment(instring: str) -> Tuple[Any, str]: if not instring.startswith("="): raise ValueError(f"malformed assignment in: {orig_instring}") (term, instring) = consume_term(instring[1:].strip()) - if (term is None) or not (isinstance(term, (int, float, bool)) or len(term) > 0): + if not ((term is None) or isinstance(term, (int, float, bool)) or (len(term) > 0)): raise ValueError(f"malformed assigned value in: {orig_instring}") return ({name: term}, instring) diff --git a/src/unitxt/processors.py b/src/unitxt/processors.py index 3d9c3e385..45de6bce2 100644 --- a/src/unitxt/processors.py +++ b/src/unitxt/processors.py @@ -258,3 +258,22 @@ def process_value(self, text: Any) -> Any: if first_line == "safe": return 1.0 return 0.0 + + +class ExtractArenaHardNumericalJudgment(FieldOperator): + def process_value(self, text: Any) -> Any: + match = re.search(r"\[\[([^\]]+)\]\]", text) + try: + res = str(match.group(1)) + if res == "A>B": + return 1 + if res == "A>>B": + return 3 + if res == "B>A": + return -1 + if res == "B>>A": + return -3 + return 0 + + except: + return 0 diff --git a/src/unitxt/schema.py b/src/unitxt/schema.py index cf4058fe3..1d2249620 100644 --- a/src/unitxt/schema.py +++ b/src/unitxt/schema.py @@ -39,9 +39,10 @@ 
def process( **instance["input_fields"], **instance["reference_fields"], "metadata": { + "data_classification_policy": instance["data_classification_policy"], "template": self.artifact_to_jsonable( instance["recipe_metadata"]["template"] - ) + ), }, } instance["task_data"] = json.dumps(task_data) diff --git a/src/unitxt/splitters.py b/src/unitxt/splitters.py index f181d147c..cb9816342 100644 --- a/src/unitxt/splitters.py +++ b/src/unitxt/splitters.py @@ -1,10 +1,11 @@ import itertools from abc import abstractmethod from copy import deepcopy -from random import Random -from typing import Dict, List +from difflib import get_close_matches +from typing import Dict, List, Optional from .artifact import Artifact +from .dict_utils import dict_get from .operator import InstanceOperatorWithMultiStreamAccess, MultiStreamOperator from .random_utils import new_random_generator from .split_utils import ( @@ -15,6 +16,7 @@ slice_streams, ) from .stream import EmptyStreamError, FaultyStreamError, MultiStream +from .type_utils import isoftype class Splitter(MultiStreamOperator): @@ -109,7 +111,6 @@ def process(self, multi_stream: MultiStream) -> MultiStream: class Sampler(Artifact): sample_size: int = None - random_generator: Random = new_random_generator(sub_seed="Sampler") def prepare(self): super().prepare() @@ -123,17 +124,15 @@ def set_size(self, size): size = int(size) self.sample_size = size - def init_new_random_generator(self): - self.random_generator = new_random_generator( - sub_seed="init_new_random_generator" - ) - @abstractmethod def sample( - self, instances_pool: List[Dict[str, object]] + self, instances_pool: List[Dict[str, object]], instance: Dict[str, object] ) -> List[Dict[str, object]]: pass + def get_random_generator_based_on_instance(self, instance): + return new_random_generator(sub_seed={**instance["input_fields"]}) + def filter_source_by_instance( self, instances_pool: List[Dict[str, object]], instance: Dict[str, object] ) -> List[Dict[str, object]]: @@ -151,11 +150,80 @@ def filter_source_by_instance( class RandomSampler(Sampler): + """Selects a random sample of instances.""" + + def sample( + self, + instances_pool: List[Dict[str, object]], + instance: Optional[Dict[str, object]], + ) -> List[Dict[str, object]]: + instances_pool = list(instances_pool) + random_generator = self.get_random_generator_based_on_instance(instance) + return random_generator.sample(instances_pool, self.sample_size) + + +class FixedIndicesSampler(Sampler): + """Selects a fix set of samples based on a list of indices.""" + + indices: List[int] + + def verify(self): + assert isoftype( + self.indices, List[int] + ), f"'indices' of {self.__class__.__name__} must be List[int]. Value {self.indices} is of type {type(self.indices)}" + super().verify() + + def sample( + self, + instances_pool: List[Dict[str, object]], + instance: Optional[Dict[str, object]], + ) -> List[Dict[str, object]]: + num_instances = len(instances_pool) + + instances = [] + for index in self.indices[0 : self.sample_size]: + if index >= num_instances: + raise ValueError( + f"FixedIndicesSampler 'indices' field contains index ({index}) which is out of bounds of the instance pool ( of size {num_instances})" + ) + instances.append(instances_pool[index]) + return instances + + +class CloseTextSampler(Sampler): + """Selects the samples of instances which are the closest textual match to the given instance. + + Comparison is done based on a given field in the instance. 
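+
+    Example:
+        A hedged configuration sketch (the field name "text" is an assumption about the
+        task's input fields, not something the class requires):
+
+            sampler = CloseTextSampler(field="text", sample_size=5)
+
+        For each processed instance, the 5 demonstrations whose "text" input field is the
+        closest textual match to that instance are selected from the pool.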
+ + """ + + field: str + def sample( - self, instances_pool: List[Dict[str, object]] + self, instances_pool: List[Dict[str, object]], instance: Dict[str, object] ) -> List[Dict[str, object]]: + field = f"input_fields/{self.field}" + value = dict_get(instance, field) + instances_pool = list(instances_pool) - return self.random_generator.sample(instances_pool, self.sample_size) + + # Get 'sample_size' closest matchest texts based on field + options = [] + for instance_in_pool in instances_pool: + options.append(dict_get(instance_in_pool, field)) + closest_matches = get_close_matches( + value, options, n=self.sample_size, cutoff=0 + ) + # Randmly select 'sample_size' instances that are from the closest matches text + # (There may be multiple instance with same text in the given field, and the order returned is + # is also randomized ) + instances_pool = [ + instance_in_pool + for instance_in_pool in instances_pool + if dict_get(instance_in_pool, field) in closest_matches + ] + random_generator = self.get_random_generator_based_on_instance(instance) + return random_generator.sample(instances_pool, self.sample_size) class DiverseLabelsSampler(Sampler): @@ -237,12 +305,15 @@ def divide_by_repr(self, exemplars_pool): return labels def sample( - self, instances_pool: List[Dict[str, object]] + self, + instances_pool: List[Dict[str, object]], + instance: Optional[Dict[str, object]], ) -> List[Dict[str, object]]: if self.labels_cache is None: self.labels_cache = self.divide_by_repr(instances_pool) all_labels = list(self.labels_cache.keys()) - self.random_generator.shuffle(all_labels) + random_generator = self.get_random_generator_based_on_instance(instance) + random_generator.shuffle(all_labels) from collections import Counter if self.sample_size > len(instances_pool): @@ -263,10 +334,10 @@ def sample( result = [] for label, allocation in allocations.items(): - sample = self.random_generator.sample(self.labels_cache[label], allocation) + sample = random_generator.sample(self.labels_cache[label], allocation) result.extend(sample) - self.random_generator.shuffle(result) + random_generator.shuffle(result) return result @@ -300,7 +371,7 @@ def process( raise ValueError( f"Size of population to sample from: {len(source_stream)} is smaller than the needed sample_size: {self.sampler.sample_size}." ) - sampled_instances = self.sampler.sample(source_stream) + sampled_instances = self.sampler.sample(source_stream, instance) instance[self.target_field] = sampled_instances return instance except FaultyStreamError as e: diff --git a/src/unitxt/standard.py b/src/unitxt/standard.py index 3b110644f..9d86c46b6 100644 --- a/src/unitxt/standard.py +++ b/src/unitxt/standard.py @@ -58,8 +58,6 @@ class BaseRecipe(Recipe, SourceSequentialOperator): def before_process_multi_stream(self): super().before_process_multi_stream() - if self.sampler: # e.g. when num_demos is 0, the sampler may not be initialized - self.sampler.init_new_random_generator() def verify(self): super().verify() @@ -96,6 +94,16 @@ def verify(self): raise ValueError( f"max_train_instances should not exceed loader_limit ({self.loader_limit}), Got max_train_instances={self.max_train_instances}" ) + if self.metrics is not None and not isinstance(self.metrics, List): + raise ValueError( + f"metrics must be a list of metrics. Got metrics = {self.metrics}" + ) + if self.postprocessors is not None and not isinstance( + self.postprocessors, List + ): + raise ValueError( + f"post processors must be a list of post processor. 
Got postprocessors = {self.postprocessors}" + ) def prepare_refiners(self): self.train_refiner.max_instances = self.max_train_instances @@ -352,7 +360,7 @@ class StandardRecipe(StandardRecipeWithIndexes): demos_taken_from (str, optional): Specifies from where the demos are taken. Default is "train". demos_field (str, optional): Field name for demos. Default is "demos". demos_removed_from_data (bool, optional): whether to remove the demos from the source data, Default is True - sampler (Sampler, optional): Sampler object to be used in the recipe. + sampler (Sampler, optional): The Sampler used to select the demonstrations when num_demos > 0. steps (List[StreamingOperator], optional): List of StreamingOperator objects to be used in the recipe. augmentor (Augmentor) : Augmentor to be used to pseudo randomly augment the source text instruction_card_index (int, optional): Index of instruction card to be used diff --git a/src/unitxt/stream_operators.py b/src/unitxt/stream_operators.py index 7a55ea63d..a6dcedf7a 100644 --- a/src/unitxt/stream_operators.py +++ b/src/unitxt/stream_operators.py @@ -82,18 +82,6 @@ def merge(self, multi_stream) -> List: left_stream_df = pd.DataFrame(left_stream) right_stream_df = pd.DataFrame(right_stream) - # Remove common col we don't join on, so we don't have unexpected column (standard behavior is to add a suffix) - common_cols = set(left_stream_df.columns).intersection( - set(right_stream_df.columns) - ) - on = self.on if self.on is not None else [] - left_on = self.left_on if self.left_on is not None else [] - right_on = self.right_on if self.right_on is not None else [] - on_cols = set(on + left_on + right_on) - col_to_remove = list(common_cols - on_cols) - left_stream_df = left_stream_df.drop(columns=col_to_remove, errors="ignore") - right_stream_df = right_stream_df.drop(columns=col_to_remove, errors="ignore") - merged_df = pd.merge( left_stream_df, right_stream_df, @@ -102,6 +90,33 @@ def merge(self, multi_stream) -> List: left_on=self.left_on, right_on=self.right_on, ) + + def assert_col_values_are_identical( + df: pd.DataFrame, col_name_1: str, col_name_2 + ): + assert df.apply( + lambda row: str(row[col_name_1]) == str(row[col_name_2]), + axis=1, + ).all() + + # If 2 streams / Dataframes contains column with the same names, which are not the columns the join is operated + # on they will be renamed to "[column_name]_x" and "[column_name]_y". Some of these columns are metadsta + # columns that unitxt adds, which must be kept the same. This code verify that all datasets have + # the same metadata values and rename the columns accordingly. + common_cols_to_verify = ["data_classification_policy", "recipe_metadata"] + for common_col in common_cols_to_verify: + assert_col_values_are_identical( + merged_df, f"{common_col}_x", f"{common_col}_y" + ) + merged_df[common_col] = merged_df[f"{common_col}_x"] + merged_df = merged_df.drop( + columns=[f"{common_col}_x", f"{common_col}_y"], errors="ignore" + ) + + assert len(merged_df) > 0, ( + "JoinStreams resulted in an empty stream." + " If you used 'loader_limit' it might be the cause of the error" + ) return merged_df.to_dict(orient="records") def process(self, multi_stream: MultiStream) -> MultiStream: @@ -124,3 +139,21 @@ def process(self, multi_stream: MultiStream) -> MultiStream: key: val for key, val in multi_stream.items() if key not in self.splits } return MultiStream(generators) + + +class DuplicateSplit(MultiStreamOperator): + """Operator which duplicate a split. 
+ + Attributes: + split (str): The split to duplicate from the stream. + to_split (str): The duplicate split's name. + """ + + split: str + to_split: str + + def process(self, multi_stream: MultiStream) -> MultiStream: + assert self.split in multi_stream + generators = multi_stream + generators[self.to_split] = generators[self.split] + return MultiStream(generators) diff --git a/src/unitxt/task.py b/src/unitxt/task.py index bbe26620d..567672e27 100644 --- a/src/unitxt/task.py +++ b/src/unitxt/task.py @@ -3,17 +3,33 @@ from .artifact import fetch_artifact from .dataclass import DeprecatedField +from .deprecation_utils import deprecation from .logging_utils import get_logger from .operator import InstanceOperator from .type_utils import ( + Type, get_args, get_origin, + is_type_dict, isoftype, + parse_type_dict, parse_type_string, + to_type_dict, + to_type_string, verify_required_schema, ) +@deprecation( + version="2.0.0", + msg="use python type instead of type strings (e.g Dict[str] instead of 'Dict[str]')", +) +def parse_string_types_instead_of_actual_objects(obj): + if isinstance(obj, dict): + return parse_type_dict(obj) + return parse_type_string(obj) + + class Task(InstanceOperator): """Task packs the different instance fields into dictionaries by their roles in the task. @@ -34,27 +50,27 @@ class Task(InstanceOperator): Will not overwrite values if already provided in a given instance. The output instance contains three fields: - "inputs" whose value is a sub-dictionary of the input instance, consisting of all the fields listed in Arg 'input_fields'. - "outputs" -- for the fields listed in Arg "outputs". + "input_fields" whose value is a sub-dictionary of the input instance, consisting of all the fields listed in Arg 'input_fields'. + "reference_fields" -- for the fields listed in Arg "reference_fields". "metrics" -- to contain the value of Arg 'metrics' """ - input_fields: Optional[Union[Dict[str, str], List[str]]] = None - reference_fields: Optional[Union[Dict[str, str], List[str]]] = None - inputs: Union[Dict[str, str], List[str]] = DeprecatedField( + input_fields: Optional[Union[Dict[str, Type], Dict[str, str], List[str]]] = None + reference_fields: Optional[Union[Dict[str, Type], Dict[str, str], List[str]]] = None + inputs: Union[Dict[str, Type], Dict[str, str], List[str]] = DeprecatedField( default=None, metadata={ "deprecation_msg": "The 'inputs' field is deprecated. Please use 'input_fields' instead." }, ) - outputs: Union[Dict[str, str], List[str]] = DeprecatedField( + outputs: Union[Dict[str, Type], Dict[str, str], List[str]] = DeprecatedField( default=None, metadata={ "deprecation_msg": "The 'outputs' field is deprecated. Please use 'reference_fields' instead." 
}, ) metrics: List[str] - prediction_type: Optional[str] = None + prediction_type: Optional[Union[Type, str]] = None augmentable_inputs: List[str] = [] defaults: Optional[Dict[str, Any]] = None @@ -76,6 +92,19 @@ def prepare(self): self.reference_fields if self.reference_fields is not None else self.outputs ) + if isoftype(self.input_fields, Dict[str, str]): + self.input_fields = parse_string_types_instead_of_actual_objects( + self.input_fields + ) + if isoftype(self.reference_fields, Dict[str, str]): + self.reference_fields = parse_string_types_instead_of_actual_objects( + self.reference_fields + ) + if isinstance(self.prediction_type, str): + self.prediction_type = parse_string_types_instead_of_actual_objects( + self.prediction_type + ) + def verify(self): if self.input_fields is None: raise ValueError("Missing attribute in task: 'input_fields' not set.") @@ -88,14 +117,14 @@ def verify(self): else self.reference_fields ) - if not isoftype(data, Dict[str, str]): + if isinstance(data, list) or not is_type_dict(data): get_logger().warning( f"'{io_type}' field of Task should be a dictionary of field names and their types. " - f"For example, {{'text': 'str', 'classes': 'List[str]'}}. Instead only '{data}' was " + f"For example, {{'text': str, 'classes': List[str]}}. Instead only '{data}' was " f"passed. All types will be assumed to be 'Any'. In future version of unitxt this " f"will raise an exception." ) - data = {key: "Any" for key in data} + data = {key: Any for key in data} if io_type == "input_fields": self.input_fields = data else: @@ -108,7 +137,7 @@ def verify(self): "Setting `prediction_type` to 'Any' (no checking is done). In future version " "of unitxt this will raise an exception." ) - self.prediction_type = "Any" + self.prediction_type = Any self.check_metrics_type() @@ -119,14 +148,35 @@ def verify(self): self.verify_defaults() + @classmethod + def process_data_after_load(cls, data): + possible_dicts = ["inputs", "input_fields", "outputs", "reference_fields"] + for dict_name in possible_dicts: + if dict_name in data and isinstance(data[dict_name], dict): + data[dict_name] = parse_type_dict(data[dict_name]) + if "prediction_type" in data: + data["prediction_type"] = parse_type_string(data["prediction_type"]) + return data + + def process_data_before_dump(self, data): + possible_dicts = ["inputs", "input_fields", "outputs", "reference_fields"] + for dict_name in possible_dicts: + if dict_name in data and isinstance(data[dict_name], dict): + if not isoftype(data[dict_name], Dict[str, str]): + data[dict_name] = to_type_dict(data[dict_name]) + if "prediction_type" in data: + if not isinstance(data["prediction_type"], str): + data["prediction_type"] = to_type_string(data["prediction_type"]) + return data + @staticmethod @lru_cache(maxsize=None) def get_metric_prediction_type(metric_id: str): metric = fetch_artifact(metric_id)[0] - return metric.get_prediction_type() + return metric.prediction_type def check_metrics_type(self) -> None: - prediction_type = parse_type_string(self.prediction_type) + prediction_type = self.prediction_type for metric_id in self.metrics: metric_prediction_type = Task.get_metric_prediction_type(metric_id) @@ -152,13 +202,13 @@ def verify_defaults(self): raise ValueError( f"If specified, the 'defaults' must be a dictionary, " f"however, '{self.defaults}' was provided instead, " - f"which is of type '{type(self.defaults)}'." + f"which is of type '{to_type_string(type(self.defaults))}'." 
) for default_name, default_value in self.defaults.items(): assert isinstance(default_name, str), ( f"If specified, all keys of the 'defaults' must be strings, " - f"however, the key '{default_name}' is of type '{type(default_name)}'." + f"however, the key '{default_name}' is of type '{to_type_string(type(default_name))}'." ) val_type = self.input_fields.get( @@ -171,9 +221,9 @@ def verify_defaults(self): f"was provided which does not match any of the keys." ) - assert isoftype(default_value, parse_type_string(val_type)), ( + assert isoftype(default_value, val_type), ( f"The value of '{default_name}' from the 'defaults' must be of " - f"type '{val_type}', however, it is of type '{type(default_value)}'." + f"type '{to_type_string(val_type)}', however, it is of type '{to_type_string(type(default_value))}'." ) def set_default_values(self, instance: Dict[str, Any]) -> Dict[str, Any]: @@ -201,5 +251,6 @@ def process( } +@deprecation(version="2.0.0", alternative=Task) class FormTask(Task): pass diff --git a/src/unitxt/templates.py b/src/unitxt/templates.py index 7ef322b55..2fc445bf4 100644 --- a/src/unitxt/templates.py +++ b/src/unitxt/templates.py @@ -28,7 +28,7 @@ class Template(InstanceOperator): Args: skip_rendered_instance (bool): if "source", "target", and "references" are already defined fields in the instance, skip its processing postprocessors: a list of strings being artifact names of text processors, to be applied on the model output - instruction: a formatting string that yields an instruction with potential participation of values from the "inputs" part of the instance + instruction: a formatting string that yields an instruction with potential participation of values from the "input_fields" part of the instance target_prefix: a string to be used to format the prompt. Not a formatting string. 
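+
+        For example (the field name "text" here is purely illustrative), an instruction such as
+        "Classify the sentiment of the following text: {text}" is filled in from the instance's
+        input fields, whereas a target_prefix such as "Sentiment: " is used verbatim, since it is
+        not a formatting string.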
""" @@ -41,19 +41,23 @@ class Template(InstanceOperator): target_prefix: str = NonPositionalField(default="") title_fields: List[str] = NonPositionalField(default_factory=list) - def inputs_to_instruction_and_target_prefix(self, inputs): + def input_fields_to_instruction_and_target_prefix(self, input_fields): instruction = self.apply_formatting( - inputs, "input", self.instruction, "instruction", serialize=True + input_fields, "input field", self.instruction, "instruction", serialize=True ) target_prefix = self.apply_formatting( - inputs, "input", self.target_prefix, "target_prefix", serialize=True + input_fields, + "input field", + self.target_prefix, + "target_prefix", + serialize=True, ) return instruction, target_prefix - def preprocess_inputs_and_outputs( - self, inputs: Dict[str, Any], outputs: Dict[str, Any] + def preprocess_input_and_reference_fields( + self, input_fields: Dict[str, Any], reference_fields: Dict[str, Any] ) -> Tuple[Dict[str, Any], Dict[str, Any]]: - return inputs, outputs + return input_fields, reference_fields def process( self, instance: Dict[str, Any], stream_name: Optional[str] = None @@ -66,20 +70,20 @@ def process( ): return instance - inputs = instance.get("inputs") - if inputs is None: - inputs = instance.get("input_fields") - outputs = instance.get("outputs") - if outputs is None: - outputs = instance.get("reference_fields") - inputs, outputs = self.preprocess_inputs_and_outputs(inputs, outputs) - - self.set_titles(inputs) - source = self.inputs_to_source(inputs) - instruction, target_prefix = self.inputs_to_instruction_and_target_prefix( - inputs + input_fields = instance.get("input_fields") + reference_fields = instance.get("reference_fields") + input_fields, reference_fields = self.preprocess_input_and_reference_fields( + input_fields, reference_fields + ) + + self.set_titles(input_fields) + source = self.input_fields_to_source(input_fields) + instruction, target_prefix = self.input_fields_to_instruction_and_target_prefix( + input_fields + ) + target, references = self.reference_fields_to_target_and_references( + reference_fields ) - target, references = self.outputs_to_target_and_references(outputs) return { **instance, @@ -91,7 +95,7 @@ def process( } @abstractmethod - def inputs_to_source(self, inputs: Dict[str, object]) -> Tuple[str, str]: + def input_fields_to_source(self, input_fields: Dict[str, object]) -> str: pass def set_titles(self, data): @@ -99,8 +103,8 @@ def set_titles(self, data): data[field] = data[field].title() @abstractmethod - def outputs_to_target_and_references( - self, outputs: Dict[str, object] + def reference_fields_to_target_and_references( + self, reference_fields: Dict[str, object] ) -> Tuple[str, List[str]]: pass @@ -129,20 +133,32 @@ def apply_formatting( class InputOutputTemplate(Template): """Generate field 'source' from fields designated as input, and fields 'target' and 'references' from fields designated as output, of the processed instance. - Args specify the formatting strings with which to glue together the input and output designated fields of the processed instance into one string ('source' and 'target'), and into a list of strings ('references'). + Args specify the formatting strings with which to glue together the input and reference fields of the processed instance into one string ('source' and 'target'), and into a list of strings ('references'). 
""" input_format: str output_format: str = None - def inputs_to_source(self, inputs: Dict[str, object]) -> Tuple[str, str]: + def input_fields_to_source( + self, input_fields: Dict[str, object] + ) -> Tuple[str, str]: return self.apply_formatting( - inputs, "input", self.input_format, "input_format", serialize=True + input_fields, + "input field", + self.input_format, + "input_format", + serialize=True, ) - def outputs_to_target_and_references(self, outputs: Dict[str, object]) -> str: + def reference_fields_to_target_and_references( + self, reference_fields: Dict[str, object] + ) -> str: target = self.apply_formatting( - outputs, "output", self.output_format, "output_format", serialize=True + reference_fields, + "reference field", + self.output_format, + "output_format", + serialize=True, ) references = [target] return target, references @@ -151,12 +167,22 @@ def outputs_to_target_and_references(self, outputs: Dict[str, object]) -> str: class InputOutputTemplateWithCustomTarget(InputOutputTemplate): reference: str - def outputs_to_target_and_references(self, outputs: Dict[str, object]) -> str: + def reference_fields_to_target_and_references( + self, reference_fields: Dict[str, object] + ) -> str: target = self.apply_formatting( - outputs, "output", self.output_format, "output_format", serialize=True + reference_fields, + "reference field", + self.output_format, + "output_format", + serialize=True, ) reference = self.apply_formatting( - outputs, "output", self.reference, "reference", serialize=True + reference_fields, + "reference field", + self.reference, + "reference", + serialize=True, ) return target, [reference] @@ -193,46 +219,52 @@ class PairwiseChoiceTemplate(InputOutputTemplate): choice_tie_label: str shuffle: bool - def verbalize_answer_field(self, outputs: Dict[str, object]): - answer = outputs[self.answer_field] + def verbalize_answer_field(self, reference_fields: Dict[str, object]): + answer = reference_fields[self.answer_field] assert answer in ["choice_a", "choice_b", "tie"] if answer == "choice_a": - outputs[self.answer_field] = self.choice_a_label + reference_fields[self.answer_field] = self.choice_a_label elif answer == "choice_b": - outputs[self.answer_field] = self.choice_b_label + reference_fields[self.answer_field] = self.choice_b_label else: - outputs[self.answer_field] = self.choice_tie_label + reference_fields[self.answer_field] = self.choice_tie_label - return outputs + return reference_fields - def shuffle_values(self, inputs: Dict[str, object], outputs: Dict[str, object]): + def shuffle_values( + self, input_fields: Dict[str, object], reference_fields: Dict[str, object] + ): + if not self.shuffle: + return input_fields, reference_fields outcome = random() # A float between 0 and 1 if outcome <= 0.5: - choice_a_value = inputs[self.choice_a_field] - choice_b_value = inputs[self.choice_b_field] + choice_a_value = input_fields[self.choice_a_field] + choice_b_value = input_fields[self.choice_b_field] - inputs[self.choice_a_field] = choice_a_value - inputs[self.choice_b_field] = choice_b_value + input_fields[self.choice_a_field] = choice_b_value + input_fields[self.choice_b_field] = choice_a_value - answer = outputs[self.answer_field] + answer = reference_fields[self.answer_field] assert answer in [ self.choice_a_label, self.choice_b_label, self.choice_tie_label, ] if answer == self.choice_a_label: - outputs[self.answer_field] = self.choice_b_label + reference_fields[self.answer_field] = self.choice_b_label elif answer == self.choice_b_label: - 
outputs[self.answer_field] = self.choice_a_label + reference_fields[self.answer_field] = self.choice_a_label - return inputs, outputs + return input_fields, reference_fields - def preprocess_inputs_and_outputs( - self, inputs: Dict[str, Any], outputs: Dict[str, Any] + def preprocess_input_and_reference_fields( + self, input_fields: Dict[str, Any], reference_fields: Dict[str, Any] ) -> Tuple[Dict[str, Any], Dict[str, Any]]: - outputs = self.verbalize_answer_field(outputs) - inputs, outputs = self.shuffle_values(inputs, outputs) - return inputs, outputs + reference_fields = self.verbalize_answer_field(reference_fields) + input_fields, reference_fields = self.shuffle_values( + input_fields, reference_fields + ) + return input_fields, reference_fields class DialogFieldsData(Artifact): @@ -247,9 +279,9 @@ class DialogTemplate(InputOutputTemplate): turns_separator: str = "\n\n" label_separator: str = " " - def process_dialog(self, inputs: Dict[str, object]): + def process_dialog(self, input_fields: Dict[str, object]): for dialog_fields in self.dialog_fields: - dialog = inputs[dialog_fields.dialog_field] + dialog = input_fields[dialog_fields.dialog_field] # TODO: update isoftype method to support Literal verification and check # it's List[Tuple[Literal["user", "assistant", "system"], str]] (Issue #799) assert isoftype(dialog, List[Tuple[str, str]]) @@ -269,25 +301,81 @@ def process_dialog(self, inputs: Dict[str, object]): elif turn_type == "system": dialog_str += f"{turns_separator}{system_role_label}{self.label_separator}{turn_text}" - inputs[dialog_fields.dialog_field] = dialog_str - return inputs + input_fields[dialog_fields.dialog_field] = dialog_str + return input_fields - def preprocess_inputs_and_outputs( - self, inputs: Dict[str, Any], outputs: Dict[str, Any] + def preprocess_input_and_reference_fields( + self, input_fields: Dict[str, Any], reference_fields: Dict[str, Any] ) -> Tuple[Dict[str, Any], Dict[str, Any]]: - return self.process_dialog(inputs), outputs + return self.process_dialog(input_fields), reference_fields class DialogPairwiseChoiceTemplate(DialogTemplate, PairwiseChoiceTemplate): - def preprocess_inputs_and_outputs( - self, inputs: Dict[str, Any], outputs: Dict[str, Any] + def preprocess_input_and_reference_fields( + self, input_fields: Dict[str, Any], reference_fields: Dict[str, Any] ) -> Tuple[Dict[str, Any], Dict[str, Any]]: - inputs, outputs = DialogTemplate.preprocess_inputs_and_outputs( - self, inputs, outputs + inputs, reference_fields = DialogTemplate.preprocess_input_and_reference_fields( + self, input_fields, reference_fields + ) + return PairwiseChoiceTemplate.preprocess_input_and_reference_fields( + self, input_fields, reference_fields ) - return PairwiseChoiceTemplate.preprocess_inputs_and_outputs( - self, inputs, outputs + + +class PairwiseComparativeRatingTemplate(InputOutputTemplate): + """PairwiseChoiceTemplate. + + Args: + choice_a_field (str): The field which contains choice_a value + choice_b_field (str): The field which contains choice_b value + answer_field (str): The field which contains the answer value. The value should be an int. + Positive for preferring choice_a, and negative for preferring choice_b + shuffle (bool): whether to shuffle the choices or not. This is done to take into account position bias. + + shuffle: 50% of the time: + 1) The values of choice_a_field and choice_b_field will be swapped. + 2) Replace the values of answer_field with its mapped value according to the reverse_preference_map Dict. 
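+
+    Example:
+        A hedged configuration sketch (all field names and formatting strings here are
+        illustrative and must exist in the task):
+
+            template = PairwiseComparativeRatingTemplate(
+                choice_a_field="response_a",
+                choice_b_field="response_b",
+                choice_a_id_field="model_a",
+                choice_b_id_field="model_b",
+                answer_field="answer",
+                shuffle=True,
+                input_format="Response A: {response_a}\nResponse B: {response_b}\nWhich response is better?",
+                output_format="{answer}",
+            )
+
+        In the implementation below, swapping the two choices flips the sign of the integer stored
+        in answer_field, so the preference keeps pointing at the same underlying response.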
+ + """ + + choice_a_field: str + choice_b_field: str + choice_a_id_field: str + choice_b_id_field: str + answer_field: str + shuffle: bool + + def shuffle_values( + self, input_fields: Dict[str, object], reference_fields: Dict[str, object] + ): + if not self.shuffle: + return input_fields, reference_fields + outcome = random() # A float between 0 and 1 + if outcome <= 0.5: + choice_a_value = input_fields[self.choice_a_field] + choice_b_value = input_fields[self.choice_b_field] + input_fields[self.choice_a_field] = choice_b_value + input_fields[self.choice_b_field] = choice_a_value + + choice_a_id_value = input_fields[self.choice_a_id_field] + choice_b_id_value = input_fields[self.choice_b_id_field] + input_fields[self.choice_a_id_field] = choice_b_id_value + input_fields[self.choice_b_id_field] = choice_a_id_value + + assert isinstance(reference_fields[self.answer_field], int) + reference_fields[self.answer_field] = ( + int(reference_fields[self.answer_field]) * -1 + ) + + return input_fields, reference_fields + + def preprocess_input_and_reference_fields( + self, input_fields: Dict[str, Any], reference_fields: Dict[str, Any] + ) -> Tuple[Dict[str, Any], Dict[str, Any]]: + input_fields, reference_fields = self.shuffle_values( + input_fields, reference_fields ) + return input_fields, reference_fields class MultipleChoiceTemplate(Template): @@ -347,53 +435,61 @@ def inputs_to_choices(self, data: Dict[str, object], choice_format: str) -> str: ) return enumrated_choices - def inputs_to_numerals(self, inputs: Dict[str, object]) -> Tuple[str, str]: - return self.inputs_to_choices(inputs, "{choice_numeral}") + def inputs_to_numerals(self, input_fields: Dict[str, object]) -> Tuple[str, str]: + return self.inputs_to_choices(input_fields, "{choice_numeral}") def prepare_multiple_choice_inputs( - self, inputs: Dict[str, object] + self, input_fields: Dict[str, object] ) -> Dict[str, object]: - choices = self.inputs_to_choices(inputs, self.source_choice_format) + choices = self.inputs_to_choices(input_fields, self.source_choice_format) return { - "numerals": self.inputs_to_numerals(inputs), - **inputs, + "numerals": self.inputs_to_numerals(input_fields), + **input_fields, self.choices_field: self.choices_separator.join(choices), } - def inputs_to_source(self, inputs: Dict[str, object]) -> Tuple[str, str]: - inputs = self.prepare_multiple_choice_inputs(inputs) + def input_fields_to_source( + self, input_fields: Dict[str, object] + ) -> Tuple[str, str]: + input_fields = self.prepare_multiple_choice_inputs(input_fields) return self.apply_formatting( - inputs, "input", self.input_format, "input_format", serialize=True + input_fields, + "input field", + self.input_format, + "input_format", + serialize=True, ) - def inputs_to_instruction_and_target_prefix(self, inputs): - inputs = self.prepare_multiple_choice_inputs(inputs) - return super().inputs_to_instruction_and_target_prefix(inputs) + def input_fields_to_instruction_and_target_prefix(self, input_fields): + input_fields = self.prepare_multiple_choice_inputs(input_fields) + return super().input_fields_to_instruction_and_target_prefix(input_fields) - def outputs_to_target_index(self, outputs: Dict[str, object]) -> str: - target = outputs[self.target_field] + def outputs_to_target_index(self, reference_fields: Dict[str, object]) -> str: + target = reference_fields[self.target_field] if not isinstance(target, int): try: - return outputs[self.choices_field].index(target) + return reference_fields[self.choices_field].index(target) except ValueError as e: 
raise ValueError( - f"MultipleChoiceTemplate could not locate textual target '{target}' in choices list: {outputs[self.choices_field]}" + f"MultipleChoiceTemplate could not locate textual target '{target}' in choices list: {reference_fields[self.choices_field]}" ) from e return target - def outputs_to_target_and_references(self, outputs: Dict[str, object]) -> str: - target = outputs[self.target_field] + def reference_fields_to_target_and_references( + self, reference_fields: Dict[str, object] + ) -> str: + target = reference_fields[self.target_field] if not isinstance(target, int): try: - target = outputs[self.choices_field].index(target) + target = reference_fields[self.choices_field].index(target) except ValueError as e: raise ValueError( - f"MultipleChoiceTemplate could not locate textual target '{target}' in choices list: {outputs[self.choices_field]}" + f"MultipleChoiceTemplate could not locate textual target '{target}' in choices list: {reference_fields[self.choices_field]}" ) from e - choices = self.inputs_to_choices(outputs, self.target_choice_format) + choices = self.inputs_to_choices(reference_fields, self.target_choice_format) try: target = choices[target] @@ -461,27 +557,35 @@ class YesNoTemplate(Template): yes_answer: str = "Yes" no_answer: str = "No" - def inputs_to_source(self, inputs: Dict[str, object]) -> Tuple[str, str]: + def input_fields_to_source( + self, input_fields: Dict[str, object] + ) -> Tuple[str, str]: return self.apply_formatting( - inputs, "input", self.input_format, "input_format", serialize=True + input_fields, + "input field", + self.input_format, + "input_format", + serialize=True, ) - def outputs_to_target_and_references(self, outputs: Dict[str, object]) -> str: + def reference_fields_to_target_and_references( + self, reference_fields: Dict[str, object] + ) -> str: try: - gold_class_names = outputs[self.label_field] + gold_class_names = reference_fields[self.label_field] except KeyError as e: raise RuntimeError( - f"Available outputs are {list(outputs.keys())}, missing required label field: '{self.label_field}'." + f"Available reference_fields are {list(reference_fields.keys())}, missing required label field: '{self.label_field}'." ) from e if not isinstance(gold_class_names, list): raise RuntimeError( f"Unexpected value for gold_class_names: '{gold_class_names}'. Expecting a list." ) try: - queried_class_name = outputs[self.class_field] + queried_class_name = reference_fields[self.class_field] except KeyError as e: raise RuntimeError( - f"Available outputs are {list(outputs.keys())}, missing required class field: '{self.class_field}'." + f"Available reference_fields are {list(reference_fields.keys())}, missing required class field: '{self.class_field}'." 
) from e if not queried_class_name or not isinstance(queried_class_name, str): raise RuntimeError( @@ -514,17 +618,21 @@ def process_dict( pairs.append(key_val_sep.join(key_val)) return pairs_sep.join(pairs) - def inputs_to_source(self, inputs: Dict[str, object]) -> Tuple[str, str]: + def input_fields_to_source( + self, input_fields: Dict[str, object] + ) -> Tuple[str, str]: return self.process_dict( - inputs, + input_fields, key_val_sep=self.key_val_separator, pairs_sep=self.pairs_separator, use_keys=self.use_keys_for_inputs, ) - def outputs_to_target_and_references(self, outputs: Dict[str, object]) -> str: + def reference_fields_to_target_and_references( + self, reference_fields: Dict[str, object] + ) -> str: target = self.process_dict( - outputs, + reference_fields, key_val_sep=self.key_val_separator, pairs_sep=self.pairs_separator, use_keys=self.use_keys_for_outputs, @@ -535,32 +643,36 @@ def outputs_to_target_and_references(self, outputs: Dict[str, object]) -> str: class OutputQuantizingTemplate(InputOutputTemplate): quantum: Union[float, int] = 0.1 # Now supports both int and float - def outputs_to_target_and_references(self, outputs: Dict[str, object]) -> str: + def reference_fields_to_target_and_references( + self, reference_fields: Dict[str, object] + ) -> str: if isinstance(self.quantum, int): # When quantum is an int, format quantized values as ints quantized_outputs = { key: f"{int(round(value / self.quantum) * self.quantum)}" - for key, value in outputs.items() + for key, value in reference_fields.items() } else: # When quantum is a float, format quantized values with precision based on quantum quantum_str = f"{self.quantum:.10f}".rstrip("0").rstrip(".") quantized_outputs = { key: f"{round(value / self.quantum) * self.quantum:{quantum_str}}" - for key, value in outputs.items() + for key, value in reference_fields.items() } - return super().outputs_to_target_and_references(quantized_outputs) + return super().reference_fields_to_target_and_references(quantized_outputs) class MultiLabelTemplate(InputOutputTemplate): labels_field: str = "labels" labels_separator: str = ", " - postprocessors: List[str] = ["processors.to_list_by_comma"] + postprocessors = ["processors.to_list_by_comma"] output_format: str = "{labels}" empty_label: str = "None" - def outputs_to_target_and_references(self, outputs: Dict[str, object]) -> str: - labels = outputs[self.labels_field] + def reference_fields_to_target_and_references( + self, reference_fields: Dict[str, object] + ) -> str: + labels = reference_fields[self.labels_field] if not isinstance(labels, list): raise ValueError( f"MultiLabelTemplate requires labels field '{self.labels_field}' to be a list. 
Got {self.labels_field}<{type(labels).__name__}>: {labels}" @@ -568,15 +680,19 @@ def outputs_to_target_and_references(self, outputs: Dict[str, object]) -> str: if len(labels) == 0: labels = [self.empty_label] labels_str = self.labels_separator.join(labels) - return super().outputs_to_target_and_references({self.labels_field: labels_str}) + return super().reference_fields_to_target_and_references( + {self.labels_field: labels_str} + ) class MultiReferenceTemplate(InputOutputTemplate): references_field: str = "references" random_reference: bool = False - def outputs_to_target_and_references(self, outputs: Dict[str, object]) -> List[str]: - references = outputs[self.references_field] + def reference_fields_to_target_and_references( + self, reference_fields: Dict[str, object] + ) -> List[str]: + references = reference_fields[self.references_field] if not isoftype(references, List[str]): raise ValueError( f"MultiReferenceTemplate requires references field '{self.references_field}' to be List[str]. Got {self.references_field}<{type(references).__name__}>: {references}" @@ -587,7 +703,7 @@ def outputs_to_target_and_references(self, outputs: Dict[str, object]) -> List[s ) if self.random_reference: - random_generator = new_random_generator(outputs) + random_generator = new_random_generator(reference_fields) target = random_generator.choice(references) else: target = references[0] @@ -607,11 +723,11 @@ class SpanLabelingBaseTemplate(MultiLabelTemplate): text_field: str = "text" labels_support: list = None - def extract_span_label_pairs(self, outputs): - spans_starts = outputs[self.spans_starts_field] - spans_ends = outputs[self.spans_ends_field] - text = outputs[self.text_field] - labels = outputs[self.labels_field] + def extract_span_label_pairs(self, reference_fields): + spans_starts = reference_fields[self.spans_starts_field] + spans_ends = reference_fields[self.spans_ends_field] + text = reference_fields[self.text_field] + labels = reference_fields[self.labels_field] spans = [] for span_start, span_end, label in zip(spans_starts, spans_ends, labels): @@ -622,12 +738,12 @@ def extract_span_label_pairs(self, outputs): if self.labels_support is None or span[3] in self.labels_support: yield span[2], span[3] - def outputs_to_target_and_references( - self, outputs: Dict[str, object] + def reference_fields_to_target_and_references( + self, reference_fields: Dict[str, object] ) -> Dict[str, object]: - span_labels_pairs = self.extract_span_label_pairs(outputs) + span_labels_pairs = self.extract_span_label_pairs(reference_fields) targets = self.span_label_pairs_to_targets(span_labels_pairs) - return super().outputs_to_target_and_references({"labels": targets}) + return super().reference_fields_to_target_and_references({"labels": targets}) @abstractmethod def span_label_pairs_to_targets(self, pairs): diff --git a/src/unitxt/test_utils/metrics.py b/src/unitxt/test_utils/metrics.py index 912f6d65c..bce8b5b3b 100644 --- a/src/unitxt/test_utils/metrics.py +++ b/src/unitxt/test_utils/metrics.py @@ -147,6 +147,12 @@ def test_evaluate( task_data: Optional[List[dict]], metric_name: str, ): + if settings.test_metric_disable: + logger.info( + "test_evaluate() functionality is disabled because unitxt.settings.test_metric_disable=True or UNITXT_TEST_METRIC_DISABLE environment variable is set" + ) + return + evaluation_result, global_outputs = evaluate( task_data, metric_names=[metric_name], compute_conf_intervals=True ) diff --git a/src/unitxt/type_utils.py b/src/unitxt/type_utils.py index afb2f7a06..108d13aef 100644 
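The guard added to test_evaluate() above can be switched on without touching test code. A minimal sketch of the two toggles mentioned in its log message (assuming the usual unitxt settings object accepts direct assignment, as other unitxt flags do; set the environment variable before unitxt is imported):

import os

# Option 1: environment variable, read when unitxt loads its settings.
os.environ["UNITXT_TEST_METRIC_DISABLE"] = "True"

# Option 2: flip the flag programmatically via the settings singleton.
from unitxt.settings_utils import get_settings

settings = get_settings()
settings.test_metric_disable = True  # test_evaluate() then logs a notice and returns early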
--- a/src/unitxt/type_utils.py +++ b/src/unitxt/type_utils.py @@ -7,6 +7,58 @@ from .utils import safe_eval +_supported_types_strings = [ + "Any", + "List[...]", + "Dict[...]", + "Tuple[...]", + "Union[...]", + "Optional[...]", + "int", + "float", + "dict", + "double", + "str", +] + +Type = typing.Any + + +class UnsupportedTypeError(ValueError): + def __init__(self, type_object): + supported_types = ", ".join(_supported_types_strings) + super().__init__( + f"Type: '{type_object!s}' is not supported type. Use one of {supported_types}" + ) + + +_generics = [ + typing.List[typing.Any], + typing.Dict[typing.Any, typing.Any], + typing.Tuple[typing.Any], + typing.Union[typing.Any, typing.Any], + typing.Optional[typing.Any], + typing.Any, +] + +_generics_types = [type(t) for t in _generics] + + +def is_type(object): + return isinstance(object, (type, *_generics_types)) + + +def is_type_dict(object): + if not isinstance(object, dict): + raise ValueError("Should be dict.") + for value in object.values(): + if isinstance(value, dict): + if not is_type_dict(value): + return False + elif not is_type(value): + return False + return True + def convert_union_type(type_string: str) -> str: """Converts Python 3.10 union type hints into form compatible with Python 3.9 version. @@ -182,6 +234,43 @@ def parse_type_string(type_string: str) -> typing.Any: return safe_eval(type_string, safe_context, safe_tokens) +def to_type_string(typing_type): + if not is_type(typing_type): + raise UnsupportedTypeError(typing_type) + type_string = ( + str(typing_type) + .replace("typing.", "") + .replace("", "") + ) + assert parse_type_string(type_string), "Is not parsed well" + return type_string + + +def to_type_dict(dict_of_typing_types): + result = {} + for key, val in dict_of_typing_types.items(): + if isinstance(val, dict): + result[key] = to_type_dict(val) + else: + result[key] = to_type_string(val) + return result + + +def parse_type_dict(type_dict): + results = {} + for k, v in type_dict.items(): + if isinstance(v, str): + results[k] = parse_type_string(v) + elif isinstance(v, dict): + results[k] = parse_type_dict(v) + else: + raise ValueError( + f"Can parse only nested dictionary with type strings, got {type(v)}" + ) + return results + + def infer_type(obj) -> typing.Any: return parse_type_string(infer_type_string(obj)) @@ -355,7 +444,7 @@ def encode_a_list_of_type_names(list_of_type_names: typing.List[str]) -> str: return "Any" -def isoftype(object, type): +def isoftype(object, typing_type): """Checks if an object is of a certain typing type, including nested types. This function supports simple types (like `int`, `str`), typing types @@ -364,7 +453,7 @@ def isoftype(object, type): Args: object: The object to check. - type: The typing type to check against. + typing_type: The typing type to check against. Returns: bool: True if the object is of the specified type, False otherwise. 
@@ -378,12 +467,15 @@ def isoftype(object, type): isoftype([1, 2, 3], typing.List[str]) # False isoftype([[1, 2], [3, 4]], typing.List[typing.List[int]]) # True """ - if type == typing.Any: + if not is_type(typing_type): + raise UnsupportedTypeError(typing_type) + + if typing_type == typing.Any: return True - if hasattr(type, "__origin__"): - origin = type.__origin__ - type_args = typing.get_args(type) + if hasattr(typing_type, "__origin__"): + origin = typing_type.__origin__ + type_args = typing.get_args(typing_type) if origin is typing.Union: return any(isoftype(object, sub_type) for sub_type in type_args) @@ -406,7 +498,7 @@ def isoftype(object, type): ) return None - return isinstance(object, type) + return isinstance(object, typing_type) # copied from: https://github.com/bojiang/typing_utils/blob/main/typing_utils/__init__.py @@ -476,12 +568,12 @@ def _hashable(value): GenericClass = type(typing.List) UnionClass = type(typing.Union) -Type = typing.Union[None, type, "typing.TypeVar"] +_Type = typing.Union[None, type, "typing.TypeVar"] OriginType = typing.Union[None, type] TypeArgs = typing.Union[type, typing.AbstractSet[type], typing.Sequence[type]] -def _normalize_aliases(type_: Type) -> Type: +def _normalize_aliases(type_: _Type) -> _Type: if isinstance(type_, typing.TypeVar): return type_ @@ -600,7 +692,7 @@ def eval_forward_ref(ref, forward_refs=None): class NormalizedType(typing.NamedTuple): """Normalized type, made it possible to compare, hash between types.""" - origin: Type + origin: _Type args: typing.Union[tuple, frozenset] = () def __eq__(self, other): @@ -635,7 +727,7 @@ def _normalize_args(tps: TypeArgs): return normalize(tps) -def normalize(type_: Type) -> NormalizedType: +def normalize(type_: _Type) -> NormalizedType: """Convert types to NormalizedType instances.""" args = get_args(type_) origin = get_origin(type_) @@ -795,8 +887,8 @@ def _is_normal_subtype( def issubtype( - left: Type, - right: Type, + left: _Type, + right: _Type, forward_refs: typing.Optional[dict] = None, ) -> typing.Optional[bool]: """Check that the left argument is a subtype of the right. @@ -844,7 +936,7 @@ def to_float_or_default(v, failure_default=0): def verify_required_schema( - required_schema_dict: typing.Dict[str, str], + required_schema_dict: typing.Dict[str, type], input_dict: typing.Dict[str, typing.Any], ) -> None: """Verifies if passed input_dict has all required fields, and they are of proper types according to required_schema_dict. @@ -856,7 +948,7 @@ def verify_required_schema( input_dict (Dict[str, Any]): Dict with input fields and their respective values. """ - for field_name, data_type_string in required_schema_dict.items(): + for field_name, data_type in required_schema_dict.items(): try: value = input_dict[field_name] except KeyError as e: @@ -865,10 +957,8 @@ def verify_required_schema( f"The available names: {list(input_dict.keys())}." ) from e - data_type = parse_type_string(data_type_string) - if not isoftype(value, data_type): raise ValueError( f"Passed value '{value}' of field '{field_name}' is not " - f"of required type: ({data_type_string})." + f"of required type: ({to_type_string(data_type)})." 
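Taken together, the type_utils changes above let task schemas carry real Python types end to end instead of type strings. A rough usage sketch of the new helpers, based only on the functions introduced in this hunk (expected values are approximate and assume the usual stripping of the "<class '...'>" wrapper):

from typing import List

from unitxt.type_utils import is_type, isoftype, parse_type_dict, to_type_dict

assert is_type(List[str])               # typing generics now count as types
assert isoftype(["a", "b"], List[str])  # nested checks behave as before
# isoftype(["a"], "List[str]") would now raise UnsupportedTypeError, since strings are no longer accepted

schema = {"question": str, "choices": List[str]}
as_strings = to_type_dict(schema)       # roughly {"question": "str", "choices": "List[str]"}
assert parse_type_dict(as_strings) == schema  # round-trips back to real types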
) diff --git a/tests/catalog/test_preparation.py b/tests/catalog/test_preparation.py index 07b248082..caec4d2d3 100644 --- a/tests/catalog/test_preparation.py +++ b/tests/catalog/test_preparation.py @@ -1,12 +1,11 @@ import glob import os import time -from datetime import timedelta from huggingface_hub.utils import GatedRepoError from unitxt.loaders import MissingKaggleCredentialsError from unitxt.logging_utils import get_logger -from unitxt.settings_utils import get_constants +from unitxt.settings_utils import get_constants, get_settings from unitxt.text_utils import print_dict from unitxt.utils import import_module_from_file @@ -14,6 +13,7 @@ logger = get_logger() constants = get_constants() +setting = get_settings() project_dir = os.path.dirname( os.path.dirname(os.path.dirname(os.path.abspath(__file__))) @@ -55,7 +55,9 @@ def test_preparations(self): self.assertTrue(True) elapsed_time = time.time() - start_time - formatted_time = str(timedelta(seconds=elapsed_time)) + minutes = int(elapsed_time // 60) + seconds = int(elapsed_time % 60) + formatted_time = f"{minutes:02}:{seconds:02}" logger.info( "\n_____________________________________________\n" f" Finished testing preparation file:\n {file}." @@ -63,10 +65,11 @@ def test_preparations(self): "\n_____________________________________________\n" ) - times[file] = formatted_time + times[file.split("prepare")[-1]] = formatted_time except Exception as e: logger.critical(f"Testing preparation file '{file}' failed:") raise e logger.critical("Preparation times table:") + times = dict(sorted(times.items(), key=lambda item: item[1], reverse=True)) print_dict(times, log_level="critical") diff --git a/tests/library/test_api.py b/tests/library/test_api.py index aa2421eee..067e51f93 100644 --- a/tests/library/test_api.py +++ b/tests/library/test_api.py @@ -24,7 +24,7 @@ def test_load_dataset(self): '"min_value": 1.0, ' '"max_value": 5.0, ' '"attribute_value": 5.0, ' - '"metadata": {"template": "templates.regression.two_texts.simple"}}', + '"metadata": {"data_classification_policy": ["public"], "template": "templates.regression.two_texts.simple"}}', "group": "unitxt", "postprocessors": [ "processors.take_first_non_empty_line", @@ -53,7 +53,10 @@ def test_evaluate(self): "min_value": 1.0, "max_value": 5.0, "attribute_value": 5.0, - "metadata": {"template": "templates.regression.two_texts.simple"}, + "metadata": { + "data_classification_policy": ["public"], + "template": "templates.regression.two_texts.simple", + }, "source": "Given this sentence: 'A plane is taking off.', on a scale of 1.0 to 5.0, what is the similarity to this text 'An air plane is taking off.'?\n", }, "group": "unitxt", @@ -125,7 +128,14 @@ def test_produce_with_recipe(self): target = { "metrics": ["metrics.f1_micro", "metrics.accuracy", "metrics.f1_macro"], - "source": "Given a premise and hypothesis classify the entailment of the hypothesis to one of entailment, not entailment.\npremise: Steve follows Fred's example in everything. He influences him hugely.\nhypothesis: Steve influences him hugely.\nThe entailment class is entailment\n\npremise: The police arrested all of the gang members. 
They were trying to stop the drug trade in the neighborhood.\nhypothesis: The police were trying to stop the drug trade in the neighborhood.\nThe entailment class is not entailment\n\npremise: It works perfectly\nhypothesis: It works!\nThe entailment class is ", + "source": "Given a premise and hypothesis classify the entailment of the hypothesis to one of entailment, not entailment.\n" + "premise: When Tatyana reached the cabin, her mother was sleeping. " + "She was careful not to disturb her, undressing and climbing back " + "into her berth.\n" + "hypothesis: mother was careful not to disturb her, undressing and " + "climbing back into her berth.\n" + "The entailment class is entailment\n\n" + "premise: Steve follows Fred's example in everything. He influences him hugely.\nhypothesis: Steve influences him hugely.\nThe entailment class is entailment\n\npremise: It works perfectly\nhypothesis: It works!\nThe entailment class is ", "target": "?", "references": ["?"], "task_data": '{"text_a": "It works perfectly", ' @@ -135,7 +145,7 @@ def test_produce_with_recipe(self): '"classes": ["entailment", "not entailment"], ' '"type_of_relation": "entailment", ' '"label": "?", ' - '"metadata": {"template": "templates.classification.multi_class.relation.default"}}', + '"metadata": {"data_classification_policy": [], "template": "templates.classification.multi_class.relation.default"}}', "group": "unitxt", "postprocessors": [ "processors.take_first_non_empty_line", @@ -164,7 +174,14 @@ def test_produce_with_recipe_with_list_of_instances(self): target = { "metrics": ["metrics.f1_micro", "metrics.accuracy", "metrics.f1_macro"], - "source": "Given a premise and hypothesis classify the entailment of the hypothesis to one of entailment, not entailment.\npremise: Steve follows Fred's example in everything. He influences him hugely.\nhypothesis: Steve influences him hugely.\nThe entailment class is entailment\n\npremise: The police arrested all of the gang members. They were trying to stop the drug trade in the neighborhood.\nhypothesis: The police were trying to stop the drug trade in the neighborhood.\nThe entailment class is not entailment\n\npremise: It works perfectly\nhypothesis: It works!\nThe entailment class is ", + "source": "Given a premise and hypothesis classify the entailment of the hypothesis to one of entailment, not entailment.\n" + "premise: When Tatyana reached the cabin, her mother was sleeping. " + "She was careful not to disturb her, undressing and climbing back " + "into her berth.\n" + "hypothesis: mother was careful not to disturb her, undressing and " + "climbing back into her berth.\n" + "The entailment class is entailment\n\n" + "premise: Steve follows Fred's example in everything. 
He influences him hugely.\nhypothesis: Steve influences him hugely.\nThe entailment class is entailment\n\npremise: It works perfectly\nhypothesis: It works!\nThe entailment class is ", "target": "?", "references": ["?"], "task_data": '{"text_a": "It works perfectly", ' @@ -174,7 +191,7 @@ def test_produce_with_recipe_with_list_of_instances(self): '"classes": ["entailment", "not entailment"], ' '"type_of_relation": "entailment", ' '"label": "?", ' - '"metadata": {"template": "templates.classification.multi_class.relation.default"}}', + '"metadata": {"data_classification_policy": [], "template": "templates.classification.multi_class.relation.default"}}', "group": "unitxt", "postprocessors": [ "processors.take_first_non_empty_line", diff --git a/tests/library/test_dataclass.py b/tests/library/test_dataclass.py index 396dd56a9..fcbc37319 100644 --- a/tests/library/test_dataclass.py +++ b/tests/library/test_dataclass.py @@ -1,4 +1,5 @@ from dataclasses import field +from typing import Callable from unitxt.dataclass import ( AbstractField, @@ -16,6 +17,7 @@ fields, fields_names, is_abstract_field, + is_class_method, is_final_field, ) @@ -276,6 +278,30 @@ class Child(Mixin, Parent1): self.assertEqual(child.b, 2) self.assertEqual(child.c, 3) + def test_filling_requirement_with_mixin_and_funcs(self): + class GrandParent(Dataclass): + t: Callable = lambda: 2 + + class Parent1(GrandParent): + b: int = 2 + + class Mixin(Dataclass): + a: int = 2 + + class Child(Mixin, Parent1): + c: int + t = lambda: 5 + + class GrandChild(Child): + c = 7 + pass + + child = GrandChild(b=2, c=3) + + self.assertEqual(child.t(), 5) + self.assertEqual(child.b, 2) + self.assertEqual(child.c, 3) + def test_raising_unexpected_keyword_argument_error(self): class Dummy(Dataclass): b = 1 # not a field!!! 
@@ -358,3 +384,23 @@ class DataclassB(DataclassA): dataclass_b.to_dict(classes=[dataclass_b]), {"b": "", "c": False}, ) + + def test_is_class_method(self): + def func(x): + return x + + class MyClass: + my_lambda = lambda x: x + my_func = func + + @classmethod + def my_class_method(cls): + pass + + def my_instance_method(self): + pass + + self.assertTrue(is_class_method(MyClass.my_class_method)) + # self.assertTrue(is_class_method(MyClass.my_instance_method)) + # self.assertFalse(is_class_method(MyClass.my_lambda)) + # self.assertFalse(is_class_method(MyClass.my_func)) diff --git a/tests/library/test_examples.py b/tests/library/test_examples.py index ac816b2a3..006f17006 100644 --- a/tests/library/test_examples.py +++ b/tests/library/test_examples.py @@ -37,8 +37,12 @@ def test_examples(self): "evaluate_summarization_dataset_llm_as_judge.py", "evaluate_different_formats.py", "evaluate_different_templates.py", + "evaluate_different_demo_selections.py", "evaluate_dataset_by_llm_as_judge_no_install.py", + "evaluate_a_judge_model_capabilities_on_arena_hard.py", + "evaluate_a_model_using_arena_hard.py", "evaluate_llm_as_judge.py", + "evaluate_using_metrics_ensemble.py", ] for file in all_example_files: logger.info( diff --git a/tests/library/test_format_and_template_interaction.py b/tests/library/test_format_and_template_interaction.py index 634c8605c..29b0a9b1b 100644 --- a/tests/library/test_format_and_template_interaction.py +++ b/tests/library/test_format_and_template_interaction.py @@ -8,7 +8,10 @@ class TestFormatAndTemplateInteraction(UnitxtTestCase): def test_interactions(self): - instance = {"inputs": {"question": "what?"}, "outputs": {"answer": "that!"}} + instance = { + "input_fields": {"question": "what?"}, + "reference_fields": {"answer": "that!"}, + } target = "that!" 
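The rename exercised by this test (and by the format and template tests below) is mechanical: stream instances now carry "input_fields" and "reference_fields" where they previously carried "inputs" and "outputs". For example (illustrative values):

# Before this patch:
old_instance = {"inputs": {"question": "what?"}, "outputs": {"answer": "that!"}}

# After this patch:
new_instance = {
    "input_fields": {"question": "what?"},
    "reference_fields": {"answer": "that!"},
}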
template_separated = InputOutputTemplate( diff --git a/tests/library/test_formats.py b/tests/library/test_formats.py index 8e339dd76..ee3351cf2 100644 --- a/tests/library/test_formats.py +++ b/tests/library/test_formats.py @@ -1,4 +1,10 @@ +from unitxt.card import TaskCard from unitxt.formats import HFSystemFormat, SystemFormat +from unitxt.loaders import LoadFromDictionary +from unitxt.standard import StandardRecipe +from unitxt.system_prompts import TextualSystemPrompt +from unitxt.task import Task +from unitxt.templates import InputOutputTemplate from unitxt.test_utils.operators import ( check_operator, ) @@ -11,8 +17,18 @@ def test_hf_system_format(self): instruction = "solve the math exercises" demo_instances = [ - {"source": "1+2", "target": "3", "instruction": instruction, "inputs": {}}, - {"source": "4-2", "target": "2", "instruction": instruction, "inputs": {}}, + { + "source": "1+2", + "target": "3", + "instruction": instruction, + "input_fields": {}, + }, + { + "source": "4-2", + "target": "2", + "instruction": instruction, + "input_fields": {}, + }, ] inputs = [ @@ -21,7 +37,7 @@ def test_hf_system_format(self): "target": "2", "instruction": instruction, "demos": demo_instances, - "inputs": {}, + "input_fields": {}, "target_prefix": "The answer is ", "system_prompt": "You are a smart assistant.", }, @@ -30,7 +46,7 @@ def test_hf_system_format(self): "target": "5", "instruction": instruction, "demos": demo_instances, - "inputs": {}, + "input_fields": {}, "target_prefix": "The answer is ", "system_prompt": "You are a smart assistant.", }, @@ -42,12 +58,12 @@ def test_hf_system_format(self): targets = [ { "target": "2", - "inputs": {}, + "input_fields": {}, "source": "<|system|>\nYou are a smart assistant.\nsolve the math exercises\n<|user|>\n1+2\n<|assistant|>\nThe answer is 3\n<|user|>\n4-2\n<|assistant|>\nThe answer is 2\n<|user|>\n1+1\n<|assistant|>\nThe answer is ", }, { "target": "5", - "inputs": {}, + "input_fields": {}, "source": "<|system|>\nYou are a smart assistant.\nsolve the math exercises\n<|user|>\n1+2\n<|assistant|>\nThe answer is 3\n<|user|>\n4-2\n<|assistant|>\nThe answer is 2\n<|user|>\n3+2\n<|assistant|>\nThe answer is ", }, ] @@ -63,8 +79,18 @@ def test_system_format(self): instruction = "solve the math exercises" demo_instances = [ - {"source": "1+2", "target": "3", "instruction": instruction, "inputs": {}}, - {"source": "4-2", "target": "2", "instruction": instruction, "inputs": {}}, + { + "source": "1+2", + "target": "3", + "instruction": instruction, + "input_fields": {}, + }, + { + "source": "4-2", + "target": "2", + "instruction": instruction, + "input_fields": {}, + }, ] inputs = [ @@ -73,28 +99,28 @@ def test_system_format(self): "target": "2", "instruction": instruction, "demos": demo_instances, - "inputs": {}, + "input_fields": {}, }, { "source": "3+2", "target": "5", "instruction": instruction, "demos": demo_instances, - "inputs": {}, + "input_fields": {}, }, { "source": "7-4", "target": "3", "instruction": instruction, "demos": demo_instances, - "inputs": {}, + "input_fields": {}, }, { "source": "12-3", "target": "9", "instruction": instruction, "demos": demo_instances, - "inputs": {}, + "input_fields": {}, }, ] @@ -108,22 +134,22 @@ def test_system_format(self): targets = [ { "target": "2", - "inputs": {}, + "input_fields": {}, "source": "User: 1+2\nAgent: 3\n\nUser: 4-2\nAgent: 2\n\nUser: solve the math exercises\n\n1+1\nAgent: ", }, { "target": "5", - "inputs": {}, + "input_fields": {}, "source": "User: 1+2\nAgent: 3\n\nUser: 4-2\nAgent: 
2\n\nUser: solve the math exercises\n\n3+2\nAgent: ", }, { "target": "3", - "inputs": {}, + "input_fields": {}, "source": "User: 1+2\nAgent: 3\n\nUser: 4-2\nAgent: 2\n\nUser: solve the math exercises\n\n7-4\nAgent: ", }, { "target": "9", - "inputs": {}, + "input_fields": {}, "source": "User: 1+2\nAgent: 3\n\nUser: 4-2\nAgent: 2\n\nUser: solve the math exercises\n\n12-3\nAgent: ", }, ] @@ -145,22 +171,22 @@ def test_system_format(self): targets = [ { "target": "2", - "inputs": {}, + "input_fields": {}, "source": "Instruction: solve the math exercises\n\nUser: 1+2\nAgent: 3\n\nUser: 4-2\nAgent: 2\n\nUser: 1+1\nAgent: ", }, { "target": "5", - "inputs": {}, + "input_fields": {}, "source": "Instruction: solve the math exercises\n\nUser: 1+2\nAgent: 3\n\nUser: 4-2\nAgent: 2\n\nUser: 3+2\nAgent: ", }, { "target": "3", - "inputs": {}, + "input_fields": {}, "source": "Instruction: solve the math exercises\n\nUser: 1+2\nAgent: 3\n\nUser: 4-2\nAgent: 2\n\nUser: 7-4\nAgent: ", }, { "target": "9", - "inputs": {}, + "input_fields": {}, "source": "Instruction: solve the math exercises\n\nUser: 1+2\nAgent: 3\n\nUser: 4-2\nAgent: 2\n\nUser: 12-3\nAgent: ", }, ] @@ -187,22 +213,22 @@ def test_system_format(self): targets_no_instruction = [ { "target": "2", - "inputs": {}, + "input_fields": {}, "source": "User: 1+2\nAgent: 3\n\nUser: 4-2\nAgent: 2\n\nUser: 1+1\nAgent: ", }, { "target": "5", - "inputs": {}, + "input_fields": {}, "source": "User: 1+2\nAgent: 3\n\nUser: 4-2\nAgent: 2\n\nUser: 3+2\nAgent: ", }, { "target": "3", - "inputs": {}, + "input_fields": {}, "source": "User: 1+2\nAgent: 3\n\nUser: 4-2\nAgent: 2\n\nUser: 7-4\nAgent: ", }, { "target": "9", - "inputs": {}, + "input_fields": {}, "source": "User: 1+2\nAgent: 3\n\nUser: 4-2\nAgent: 2\n\nUser: 12-3\nAgent: ", }, ] @@ -218,7 +244,7 @@ def test_system_format(self): "source": 'This is my sentence: "was so bad"', "target": "negative", "references": ["negative"], - "inputs": {}, + "input_fields": {}, "instruction": "classify user sentence by its sentiment to either positive, or negative.", "demos": [ { @@ -247,7 +273,7 @@ def test_system_format(self): "source": 'Instruction:classify user sentence by its sentiment to either positive, or negative.\n\nUser:This is my sentence: "was so not good"\nAgent:negative\n\nUser:This is my sentence: "was so good"\nAgent:positive\n\nUser:This is my sentence: "was so bad"\nAgent:', "target": "negative", "references": ["negative"], - "inputs": {}, + "input_fields": {}, } self.assertDictEqual(result, target) @@ -256,7 +282,7 @@ def test_system_format(self): "source": 'This is my sentence: "was so bad"', "target": "negative", "references": ["negative"], - "inputs": {}, + "input_fields": {}, "instruction": "classify user sentence by its sentiment to either positive, or negative.", } system_format = SystemFormat( @@ -267,7 +293,7 @@ def test_system_format(self): target = { "source": 'Instruction:classify user sentence by its sentiment to either positive, or negative.\n\nUser:This is my sentence: "was so bad"\nAgent:', "target": "negative", - "inputs": {}, + "input_fields": {}, "references": ["negative"], } self.assertDictEqual(result, target) @@ -284,7 +310,7 @@ def test_system_format(self): "source": 'This is my sentence: "was so bad"', "target": "negative", "references": ["negative"], - "inputs": {}, + "input_fields": {}, "instruction": "classify user sentence by its sentiment to either positive, or negative.", "demos": [ { @@ -307,11 +333,112 @@ def test_system_format(self): "source": '[INST] <>\nclassify user 
sentence by its sentiment to either positive, or negative.\n\nUser: This is my sentence: "was so not good"\nAgent: negative\n\nUser: This is my sentence: "was so good"\nAgent: positive\n\nUser: This is my sentence: "was so bad"\nAgent: [/INST]', "target": "negative", "references": ["negative"], - "inputs": {}, + "input_fields": {}, } self.assertDictEqual(result, target) + def test_system_format_with_demos_different_target_prefixes(self): + instances = [ + {"question": "1+1", "answer": "2"}, + {"question": "2+2", "answer": "4"}, + {"question": "3+3", "answer": "6"}, + {"question": "4+4", "answer": "8"}, + {"question": "5+5", "answer": "10"}, + {"question": "6+6", "answer": "12"}, + {"question": "7+7", "answer": "14"}, + {"question": "8+8", "answer": "16"}, + {"question": "9+9", "answer": "18"}, + {"question": "10+10", "answer": "20"}, + ] + + task = Task( + input_fields={"question": str}, + reference_fields={"answer": str}, + prediction_type=str, + metrics=["metrics.accuracy"], + ) + + template = InputOutputTemplate( + input_format="Solve: {question}\nAnswer: ", + output_format="{answer}", + postprocessors=[], + target_prefix="{question} = ", + ) + + card = TaskCard( + loader=LoadFromDictionary(data={"train": instances}), + preprocess_steps=[], + task=task, + templates=[template], + ) + + recipe = StandardRecipe( + card=card, + loader_limit=20, + demos_pool_size=5, + num_demos=2, + template_card_index=0, + system_prompt=TextualSystemPrompt("\nSolve the following exercises.\n "), + ) + ms = recipe() + trains = list(ms["train"]) + + formatted_source = ( + trains[0]["source"] + + "\n\n" + + trains[1]["source"] + + "\n\n" + + trains[2]["source"] + ) + target_formatted_source = ( + "\n" + "Solve the following exercises.\n" + " \n" + "Solve: 4+4\n" + "Answer: \n" + "4+4 = 8\n" + "\n" + "Solve: 3+3\n" + "Answer: \n" + "3+3 = 6\n" + "\n" + "Solve: 6+6\n" + "Answer: \n" + "6+6 = \n" + "\n" + "\n" + "Solve the following exercises.\n" + " \n" + "Solve: 3+3\n" + "Answer: \n" + "3+3 = 6\n" + "\n" + "Solve: 4+4\n" + "Answer: \n" + "4+4 = 8\n" + "\n" + "Solve: 7+7\n" + "Answer: \n" + "7+7 = \n" + "\n" + "\n" + "Solve the following exercises.\n" + " \n" + "Solve: 4+4\n" + "Answer: \n" + "4+4 = 8\n" + "\n" + "Solve: 5+5\n" + "Answer: \n" + "5+5 = 10\n" + "\n" + "Solve: 8+8\n" + "Answer: \n" + "8+8 = " + ) + self.assertEqual(target_formatted_source, formatted_source) + def test_system_format_with_args(self): system_format = SystemFormat( format_args={"input_prefix": "User: ", "output_prefix": "Agent: "}, @@ -323,8 +450,18 @@ def test_system_format_with_args(self): instruction = "solve the math exercises" demo_instances = [ - {"source": "1+2", "target": "3", "instruction": instruction, "inputs": {}}, - {"source": "4-2", "target": "2", "instruction": instruction, "inputs": {}}, + { + "source": "1+2", + "target": "3", + "instruction": instruction, + "input_fields": {}, + }, + { + "source": "4-2", + "target": "2", + "instruction": instruction, + "input_fields": {}, + }, ] inputs = [ @@ -333,50 +470,50 @@ def test_system_format_with_args(self): "target": "2", "instruction": instruction, "demos": demo_instances, - "inputs": {}, + "input_fields": {}, }, { "source": "3+2", "target": "5", "instruction": instruction, "demos": demo_instances, - "inputs": {}, + "input_fields": {}, }, { "source": "7-4", "target": "3", "instruction": instruction, "demos": demo_instances, - "inputs": {}, + "input_fields": {}, }, { "source": "12-3", "target": "9", "instruction": instruction, "demos": demo_instances, - "inputs": {}, + 
"input_fields": {}, }, ] targets = [ { "target": "2", - "inputs": {}, + "input_fields": {}, "source": "User: 1+2\nAgent: 3\n\nUser: 4-2\nAgent: 2\n\nUser: solve the math exercises\n\n1+1\nAgent: ", }, { "target": "5", - "inputs": {}, + "input_fields": {}, "source": "User: 1+2\nAgent: 3\n\nUser: 4-2\nAgent: 2\n\nUser: solve the math exercises\n\n3+2\nAgent: ", }, { "target": "3", - "inputs": {}, + "input_fields": {}, "source": "User: 1+2\nAgent: 3\n\nUser: 4-2\nAgent: 2\n\nUser: solve the math exercises\n\n7-4\nAgent: ", }, { "target": "9", - "inputs": {}, + "input_fields": {}, "source": "User: 1+2\nAgent: 3\n\nUser: 4-2\nAgent: 2\n\nUser: solve the math exercises\n\n12-3\nAgent: ", }, ] diff --git a/tests/library/test_metric_utils.py b/tests/library/test_metric_utils.py index 1f13b43e3..c03ab306c 100644 --- a/tests/library/test_metric_utils.py +++ b/tests/library/test_metric_utils.py @@ -21,12 +21,16 @@ class AvgRougeNoBootstrap(Rouge): def prepare(self): self.n_resamples = None self.rouge_types = ["rougeL"] + self.ci_scores = ["rougeL"] + self.hf_metric_fields = ["rougeL"] + self.reduction_map = {"mean": ["rougeL"]} self.use_aggregator = False super().prepare() - def compute(self, references, predictions, task_data: List[Dict]): - res_list = super().compute(references, predictions, task_data)["rougeL"] - return {"rougeL": nanmean(res_list)} + def compute(self, references, prediction, task_data: List[Dict]): + # single score for a single instance + res = super().compute(references, prediction, task_data)["rougeL"] + return {"rougeL": res} metric = AvgRougeNoBootstrap() references = [ diff --git a/tests/library/test_metrics.py b/tests/library/test_metrics.py index 9c5a1991e..323525d03 100644 --- a/tests/library/test_metrics.py +++ b/tests/library/test_metrics.py @@ -1,4 +1,5 @@ from math import isnan +from typing import Dict, List from unitxt.inference import MockInferenceEngine from unitxt.llm_as_judge import LLMAsJudge @@ -38,9 +39,11 @@ GroupMeanAccuracy, GroupMeanStringContainment, GroupMeanTokenOverlap, + HuggingfaceMetric, KendallTauMetric, LlamaIndexCorrectness, MaxAccuracy, + MetricsEnsemble, NormalizedSacrebleu, Perplexity, PrecisionBinary, @@ -50,7 +53,7 @@ TokenOverlap, UnsortedListExactMatch, ) -from unitxt.test_utils.metrics import apply_metric +from unitxt.test_utils.metrics import apply_metric, check_scores, test_metric from tests.utils import UnitxtTestCase @@ -161,6 +164,18 @@ def test_unsorted_list_exact_match(self): for output, target in zip(outputs, instance_targets): self.assertDictEqual(output["score"]["instance"], target) + def prediction_type_definition(self): + class TempAccuracy(Accuracy): + prediction_type = int + + self.assertEqual(TempAccuracy().prediction_type, int) + + def test_prediction_type_definition_deprecated(self): + class TempAccuracy2(Accuracy): + prediction_type = "int" + + self.assertEqual(TempAccuracy2().prediction_type, int) + def test_accuracy(self): metric = Accuracy() @@ -799,19 +814,54 @@ def test_rouge(self): global_target = 5 / 6 self.assertAlmostEqual(global_target, outputs[0]["score"]["global"]["score"]) - def test_rouge_l(self): - metric = Rouge( - n_resamples=None, # disable confidence interval calculation which fails for this metric configuration - use_aggregator=False, - rouge_types=["rougeL"], - ) - references = [["hello", "there"], ["general kenobi", "general yoda"]] - predictions = ["hello there", "general kenobi"] + # compare with the HF implementation + class OldRouge(HuggingfaceMetric): + hf_metric_name = "rouge" + main_score = 
"rougeL" + scale = 1.0 + + prediction_type = "str" + single_reference_per_prediction = False # multiple references allowed + + use_aggregator: bool = True + rouge_types: List[str] = ["rouge1", "rouge2", "rougeL", "rougeLsum"] + + sent_split_newline: bool = True + + _requirements_list: List[str] = ["nltk", "rouge_score"] + + def prepare(self): + super().prepare() + + self.hf_compute_args.update( + { + "use_aggregator": self.use_aggregator, + "rouge_types": self.rouge_types, + } + ) + + import nltk + + nltk.download("punkt") + self.sent_tokenize = nltk.sent_tokenize + + def compute(self, references, predictions, task_data: List[Dict]): + if self.sent_split_newline: + predictions = [ + "\n".join(self.sent_tokenize(prediction.strip())) + for prediction in predictions + ] + references = [ + ["\n".join(self.sent_tokenize(r.strip())) for r in reference] + for reference in references + ] + return super().compute(references, predictions, task_data) + + metric = OldRouge() outputs = apply_metric( metric=metric, predictions=predictions, references=references ) - global_target = [2 / 3, 1.0] - self.assertListEqual(global_target, outputs[0]["score"]["global"]["score"]) + self.assertAlmostEqual(global_target, outputs[0]["score"]["global"]["score"]) def test_token_overlap(self): metric = TokenOverlap() @@ -1150,8 +1200,8 @@ def test_perplexity_with_prefix(self): ) expected_global_result = { - "my_perplexity": 0.05986589565873146, - "score": 0.05986589565873146, + "my_perplexity": 0.06, + "score": 0.06, "score_name": "my_perplexity", } @@ -1162,18 +1212,21 @@ def test_perplexity_with_prefix(self): for key, value in global_result.items() if key in expected_global_result } - self.assertDictEqual(global_result, expected_global_result) - instance_targets = [ + expected_instance_results = [ { - "my_perplexity": 0.05986589565873146, - "score": 0.05986589565873146, + "my_perplexity": 0.06, + "score": 0.06, "score_name": "my_perplexity", - "my_reference_scores": [0.05986589565873146], + "my_reference_scores": [0.06], } ] - for output, target in zip(outputs, instance_targets): - self.assertDictEqual(output["score"]["instance"], target) + check_scores( + expected_global_result, + expected_instance_results, + global_outputs=outputs[0]["score"]["global"], + instance_outputs=[outputs[0]["score"]["instance"]], + ) class TestConfidenceIntervals(UnitxtTestCase): @@ -1479,7 +1532,10 @@ def test_llm_as_judge_metric(self): "output": "output", "type_of_output": "type", "source": "input", - "metadata": {"template": "templates.generation.default"}, + "metadata": { + "template": "templates.generation.default", + "data_classification_policy": ["public"], + }, } ] * 3 @@ -1495,6 +1551,28 @@ def test_llm_as_judge_metric(self): metric_label: 1.0, "score_name": metric_label, "score": 1.0, + "judge_raw_input": "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n" + "Please act as an impartial judge and " + "evaluate the quality of the response " + "provided by an AI assistant to the user " + "question displayed below. Your evaluation " + "should consider factors such as the " + "helpfulness, relevance, accuracy, depth, " + "creativity, and level of detail of the " + "response. Begin your evaluation by " + "providing a short explanation. Be as " + "objective as possible. 
After providing your " + "explanation, you must rate the response on " + "a scale of 1 to 10 by strictly following " + 'this format: "[[rating]]", for example: ' + '"Rating: [[5]]".\n\n' + "<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n" + "[Question]\n" + "Given the following type, generate the corresponding type. type: input\n\n\n" + "[The Start of Assistant's Answer]\n" + "[[10]]\n" + "[The End of Assistant's " + "Answer]<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", "judge_raw_output": "[[10]]", } ] * 3 @@ -1620,3 +1698,67 @@ def test_fin_qa_eval(self): for i in range(len(actual_scores)): self.assertAlmostEqual(actual_scores[i], target_scores[i]) + + def test_metrics_ensemble(self): + metric = MetricsEnsemble( + main_score="ensemble_score", + metrics=[ + "metrics.precision_micro_multi_label", + "metrics.recall_macro_multi_label", + ], + weights=None, + ) + + predictions = [["A"], ["B"], [""], ["A"]] + references = [[["B", "A"]], [["B"]], [["A"]], [[""]]] + + instance_targets = [ + { + "ensemble_score": 0.75, + "ensemble_0_precision_micro": 1.0, + "ensemble_1_recall_macro": 0.5, + "score": 0.75, + "score_name": "ensemble_score", + }, + { + "ensemble_score": 1.0, + "ensemble_0_precision_micro": 1.0, + "ensemble_1_recall_macro": 1.0, + "score": 1.0, + "score_name": "ensemble_score", + }, + { + "ensemble_score": 0.0, + "ensemble_0_precision_micro": 0.0, + "ensemble_1_recall_macro": 0.0, + "score": 0.0, + "score_name": "ensemble_score", + }, + { + "ensemble_score": 0.0, + "ensemble_0_precision_micro": 0.0, + "ensemble_1_recall_macro": 0.0, + "score": 0.0, + "score_name": "ensemble_score", + }, + ] + + global_target = { + "ensemble_0_precision_micro": 0.5, + "ensemble_0_precision_micro_ci_high": 1.0, + "ensemble_0_precision_micro_ci_low": 0.0, + "ensemble_1_recall_macro": 0.33, + "ensemble_1_recall_macro_ci_high": 0.56, + "ensemble_1_recall_macro_ci_low": 0.0, + "ensemble_score": 0.44, + "score": 0.44, + "score_name": "ensemble_score", + } + + test_metric( + metric=metric, + predictions=predictions, + references=references, + instance_targets=instance_targets, + global_target=global_target, + ) diff --git a/tests/library/test_operators.py b/tests/library/test_operators.py index 6651cfa18..22a3a7ba4 100644 --- a/tests/library/test_operators.py +++ b/tests/library/test_operators.py @@ -2839,10 +2839,13 @@ def test_render_demonstrations(self): instance = { "demos": [ { - "inputs": {"text": "was so not good"}, - "outputs": {"label": "negative"}, + "input_fields": {"text": "was so not good"}, + "reference_fields": {"label": "negative"}, + }, + { + "input_fields": {"text": "was so good"}, + "reference_fields": {"label": "positive"}, }, - {"inputs": {"text": "was so good"}, "outputs": {"label": "positive"}}, ] } @@ -2852,8 +2855,8 @@ def test_render_demonstrations(self): target = { "demos": [ { - "inputs": {"text": "was so not good"}, - "outputs": {"label": "negative"}, + "input_fields": {"text": "was so not good"}, + "reference_fields": {"label": "negative"}, "source": 'This is my sentence: "was so not good"', "target": "negative", "references": ["negative"], @@ -2861,8 +2864,8 @@ def test_render_demonstrations(self): "target_prefix": "", }, { - "inputs": {"text": "was so good"}, - "outputs": {"label": "positive"}, + "input_fields": {"text": "was so good"}, + "reference_fields": {"label": "positive"}, "source": 'This is my sentence: "was so good"', "target": "positive", "references": ["positive"], @@ -2882,12 +2885,12 @@ def test_render_demonstrations_multi_reference(self): 
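The new MetricsEnsemble covered by test_metrics_ensemble above can also be applied directly. A small sketch reusing the same helper imports as the test; the constructor arguments mirror the test, while the inputs and printed key are illustrative:

from unitxt.metrics import MetricsEnsemble
from unitxt.test_utils.metrics import apply_metric

ensemble = MetricsEnsemble(
    main_score="ensemble_score",
    metrics=[
        "metrics.precision_micro_multi_label",
        "metrics.recall_macro_multi_label",
    ],
    weights=None,  # None averages the member metrics, as in the test above
)

outputs = apply_metric(
    metric=ensemble,
    predictions=[["A"], ["B"]],
    references=[[["B", "A"]], [["B"]]],
)
print(outputs[0]["score"]["global"]["ensemble_score"])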
instance = { "demos": [ { - "inputs": {"text": "who was he?"}, - "outputs": {"answer": ["Dan", "Yossi"]}, + "input_fields": {"text": "who was he?"}, + "reference_fields": {"answer": ["Dan", "Yossi"]}, }, { - "inputs": {"text": "who was she?"}, - "outputs": {"answer": ["Shira", "Yael"]}, + "input_fields": {"text": "who was she?"}, + "reference_fields": {"answer": ["Shira", "Yael"]}, }, ] } @@ -2898,8 +2901,8 @@ def test_render_demonstrations_multi_reference(self): target = { "demos": [ { - "inputs": {"text": "who was he?"}, - "outputs": {"answer": ["Dan", "Yossi"]}, + "input_fields": {"text": "who was he?"}, + "reference_fields": {"answer": ["Dan", "Yossi"]}, "source": "This is my sentence: who was he?", "target": "Dan", "references": ["Dan", "Yossi"], @@ -2907,8 +2910,8 @@ def test_render_demonstrations_multi_reference(self): "target_prefix": "", }, { - "inputs": {"text": "who was she?"}, - "outputs": {"answer": ["Shira", "Yael"]}, + "input_fields": {"text": "who was she?"}, + "reference_fields": {"answer": ["Shira", "Yael"]}, "source": "This is my sentence: who was she?", "target": "Shira", "references": ["Shira", "Yael"], @@ -2925,7 +2928,7 @@ def test_icl_format_with_demonstrations(self): "source": "1+1", "target": "2", "instruction": "solve the math exercises", - "inputs": {}, + "input_fields": {}, } demos_instances = [ {"source": "1+2", "target": "3", "instruction": "solve the math exercises"}, @@ -2964,7 +2967,7 @@ def test_system_format_with_demonstrations_and_instruction_after_demos( instance = { "source": "1+1", "target": "2", - "inputs": {}, + "input_fields": {}, "instruction": "solve the math exercises", "demos": demo_instances, } @@ -2993,7 +2996,7 @@ def test_system_format_without_demonstrations(self): "source": "1+1", "target": "2", "instruction": "solve the math exercises", - "inputs": {}, + "input_fields": {}, } target = """Instruction:solve the math exercises @@ -3011,7 +3014,7 @@ def test_system_format_without_demonstrations(self): self.assertEqual(instance["source"], target) def test_model_input_formatter_without_demonstrations_or_instruction(self): - instance = {"source": "1+1", "target": "2", "inputs": {}} + instance = {"source": "1+1", "target": "2", "input_fields": {}} target = """User:1+1 Agent:""" @@ -3024,7 +3027,12 @@ def test_model_input_formatter_without_demonstrations_or_instruction(self): self.assertEqual(instance_out["source"], target) def test_system_format_without_demonstrations_and_empty_instruction(self): - instance = {"source": "1+1", "target": "2", "instruction": "", "inputs": {}} + instance = { + "source": "1+1", + "target": "2", + "instruction": "", + "input_fields": {}, + } target = """User:1+1 Agent:""" @@ -3084,12 +3092,32 @@ def test_join_streams(self): input_multi_stream = MultiStream( { "questions": [ - {"question": "question_1", "id": "1"}, - {"question": "question_2", "id": "2"}, + { + "question": "question_1", + "id": "1", + "data_classification_policy": ["public"], + "recipe_metadata": [], + }, + { + "question": "question_2", + "id": "2", + "data_classification_policy": ["public"], + "recipe_metadata": [], + }, ], "answers": [ - {"answer": "answer_1", "id": "1"}, - {"answer": "answer_2", "id": "2"}, + { + "answer": "answer_1", + "id": "1", + "data_classification_policy": ["public"], + "recipe_metadata": [], + }, + { + "answer": "answer_2", + "id": "2", + "data_classification_policy": ["public"], + "recipe_metadata": [], + }, ], "train": [{"field": "train1"}], } @@ -3107,8 +3135,20 @@ def test_join_streams(self): ) joined_stream = 
list(output_multi_stream["questions_and_answers"]) expected_joined_stream = [ - {"question": "question_1", "id": "1", "answer": "answer_1"}, - {"question": "question_2", "id": "2", "answer": "answer_2"}, + { + "question": "question_1", + "id": "1", + "answer": "answer_1", + "data_classification_policy": ["public"], + "recipe_metadata": [], + }, + { + "question": "question_2", + "id": "2", + "answer": "answer_2", + "data_classification_policy": ["public"], + "recipe_metadata": [], + }, ] TestOperators().compare_streams(joined_stream, expected_joined_stream) @@ -3139,7 +3179,13 @@ def test_select_fields(self): input_multi_stream = MultiStream( { "questions": [ - {"question": "question_1", "id_1": "1", "id_2": "1"}, + { + "question": "question_1", + "id_1": "1", + "id_2": "1", + "data_classification_policy": ["public"], + "recipe_metadata": [], + }, ], } ) @@ -3148,5 +3194,12 @@ def test_select_fields(self): ) self.assertListEqual(list(output_multi_stream.keys()), ["questions"]) joined_stream = list(output_multi_stream["questions"]) - expected_joined_stream = [{"question": "question_1", "id_1": "1"}] + expected_joined_stream = [ + { + "question": "question_1", + "id_1": "1", + "data_classification_policy": ["public"], + "recipe_metadata": [], + } + ] TestOperators().compare_streams(joined_stream, expected_joined_stream) diff --git a/tests/library/test_parsing_utils.py b/tests/library/test_parsing_utils.py index 448ebc2ef..97db7bb57 100644 --- a/tests/library/test_parsing_utils.py +++ b/tests/library/test_parsing_utils.py @@ -18,6 +18,18 @@ def test_parse_key_equals_value_string_to_dict_simple_query(self): expected = {"name": "John-Doe", "-age": 30, "--height": 5.8} self.assertEqual(parse_key_equals_value_string_to_dict(query), expected) + # constants: True, False, None + query = "name=John-Doe,-age=30,--height=5.8,wife=None,happy=False,rich=True" + expected = { + "name": "John-Doe", + "-age": 30, + "--height": 5.8, + "wife": None, + "happy": False, + "rich": True, + } + self.assertEqual(parse_key_equals_value_string_to_dict(query), expected) + def test_parse_key_equals_value_string_to_dict_with_spaces(self): query = "first name=Jane Doe, last name=Doe, country=USA, balance=100.50" expected = { diff --git a/tests/library/test_recipe.py b/tests/library/test_recipe.py index a4d336cfc..6b7f2cbc8 100644 --- a/tests/library/test_recipe.py +++ b/tests/library/test_recipe.py @@ -94,7 +94,7 @@ def test_standard_recipe_production_without_demos(self): '"choices": ["yes", "not", "maybe"], ' '"answer": "maybe", ' '"options": [" A", " B", " C"], ' - '"metadata": {"template": "templates.qa.multiple_choice.with_topic.lm_eval_harness"}' + '"metadata": {"data_classification_policy": [], "template": "templates.qa.multiple_choice.with_topic.lm_eval_harness"}' "}", "group": "unitxt", "postprocessors": ["processors.first_character"], @@ -168,7 +168,54 @@ def test_standard_recipe_production_with_demos(self): target = { "metrics": ["metrics.accuracy"], - "source": "<>\nYou are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\n\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. 
If you don't know the answer to a question, please don't share false information.\n<>\n\n\n\n\nUser: The following are multiple choice questions (with answers) about marketing.\n\nThe single group within society that is most vulnerable to reference group influence is:\nA. The older consumer who feels somewhat left out of things.\nB. The married women, many of whom feel a need for stability in their lives.\nC. New immigrants who really want to assimilate into their new culture.\nD. Children, who base most of their buying decisions on outside influences.\nAnswer:\nAgent: D\n\nUser: The following are multiple choice questions (with answers) about marketing.\n\n Which of the following is an assumption in Maslow's hierarchy of needs?\nA. Needs are dependent on culture and also on social class.\nB. Lower-level needs must be at least partially satisfied before higher needs can affect behaviour.\nC. Needs are not prioritized or arranged in any particular order.\nD. Satisfied needs are motivators, and new needs emerge when current needs remain unmet.\nAnswer:\nAgent: B\n\nUser: The following are multiple choice questions (with answers) about marketing.\n\nIn an organization, the group of people tasked with buying decisions is referred to as the _______________.\nA. Outsourcing unit.\nB. Procurement centre.\nC. Chief executive unit.\nD. Decision-making unit.\nAnswer:\nAgent: D\n\n\nUser:The following are multiple choice questions (with answers) about testing.\n\nwhat?\nA. yes\nB. not\nC. maybe\nAnswer:\nAgent:", + "source": """<> +You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. + +If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information. +<> + + + + +User: The following are multiple choice questions (with answers) about marketing. + +Although the content and quality can be as controlled as direct mail, response rates of this medium are lower because of the lack of a personal address mechanism. This media format is known as: +A. Care lines. +B. Direct mail. +C. Inserts. +D. Door to door. +Answer: +Agent: D + +User: The following are multiple choice questions (with answers) about marketing. + + _____________ is a natural outcome when combining demographic and geographic variables. +A. Geodemographics +B. Product differentiation. +C. ANSOFF matrix. +D. Brand management. +Answer: +Agent: A + +User: The following are multiple choice questions (with answers) about marketing. + +In an organization, the group of people tasked with buying decisions is referred to as the _______________. +A. Outsourcing unit. +B. Procurement centre. +C. Chief executive unit. +D. Decision-making unit. +Answer: +Agent: D + + +User:The following are multiple choice questions (with answers) about testing. + +what? +A. yes +B. not +C. 
maybe +Answer: +Agent:""", "target": " C", "references": [" C"], "task_data": '{"topic": "testing",' @@ -176,7 +223,7 @@ def test_standard_recipe_production_with_demos(self): ' "choices": ["yes", "not", "maybe"],' ' "answer": "maybe",' ' "options": [" A", " B", " C"],' - ' "metadata": {"template": "templates.qa.multiple_choice.with_topic.lm_eval_harness"}' + ' "metadata": {"data_classification_policy": [], "template": "templates.qa.multiple_choice.with_topic.lm_eval_harness"}' "}", "group": "unitxt", "postprocessors": ["processors.first_character"], @@ -544,30 +591,6 @@ def test_recipe_loaded_from_arguments_and_overwrites_only(self): first_inst = next(iterator) self.assertListEqual(["metrics.accuracy"], first_inst["metrics"]) - def test_standard_recipe_with_a_sampler(self): - """Check that the sampler is re-initialized before processing a recipe. - - To do so, save the random generator within the sampler before activating the recipe, - and compare it to the random generator within the sampler after the revipe was called. - The two generators should be different objects, indicating that the sampler was properly - re-initialized during the preparation of the recipe. - """ - recipe = StandardRecipeWithIndexes( - card="cards.sst2", - template_card_index=0, - max_train_instances=0, - max_test_instances=2, - num_demos=1, - demos_pool_size=10, - ) - sampler = recipe.card.sampler - - random_generator1 = sampler.random_generator - recipe() - random_generator2 = sampler.random_generator - - self.assertNotEqual(random_generator1, random_generator2) - def test_standard_recipe_with_a_missing_sampler(self): """Check that initializing a recipe with a card that does not have a sampler raises an exception.""" task_card, _ = copy.deepcopy(fetch_artifact("cards.sst2")) diff --git a/tests/library/test_splitters.py b/tests/library/test_splitters.py index bac1943f9..c4833bc83 100644 --- a/tests/library/test_splitters.py +++ b/tests/library/test_splitters.py @@ -1,6 +1,10 @@ import copy -from unitxt.splitters import DiverseLabelsSampler +from unitxt.api import load_dataset +from unitxt.blocks import TaskCard +from unitxt.collections_operators import Wrap +from unitxt.loaders import LoadFromDictionary +from unitxt.splitters import CloseTextSampler, DiverseLabelsSampler, FixedIndicesSampler from tests.utils import UnitxtTestCase @@ -35,7 +39,10 @@ def test_sample(self): self.new_exemplar(choices, ["cow"], "Moo1"), self.new_exemplar(choices, ["duck"], "Quack"), ] - result = sampler.sample(instances) + result = sampler.sample( + instances, + self.new_exemplar(choices, ["any"], "any"), + ) from collections import Counter @@ -59,7 +66,10 @@ def test_sample_no_empty_labels(self): self.new_exemplar(choices, ["cow"], "Moo1"), self.new_exemplar(choices, ["duck"], "Quack"), ] - result = sampler.sample(instances) + result = sampler.sample( + instances, + self.new_exemplar(choices, ["any"], "any"), + ) from collections import Counter @@ -79,7 +89,9 @@ def test_sample_list(self): self.new_exemplar(choices, ["dog"], "Bark2"), self.new_exemplar(choices, ["duck"], "Quack"), ] - result = sampler.sample(instances) + result = sampler.sample( + instances, self.new_exemplar(choices, ["any"], "any") + ) from collections import Counter counts = Counter() @@ -146,3 +158,140 @@ def test_filter_with_bad_input(self): f"'input_fields' field is missing from '{instance}'.", str(cm.exception), ) + + +class TestCloseTextSampler(UnitxtTestCase): + """Tests for the CloseTextSampler object.""" + + @staticmethod + def new_exemplar(question: str, 
answer: str): + """Return an exemplar in a correct format.""" + return { + "input_fields": {"question": question, "answer": answer}, + } + + def test_sample(self): + instances = [ + self.new_exemplar("What is your name?", "John"), + self.new_exemplar("In which country is Paris located?", "France"), + self.new_exemplar("What's the time?", "22:00"), + self.new_exemplar("What is your name, please?", "Mary"), + ] + + num_samples = 2 + sampler = CloseTextSampler(num_samples, field="question") + + results = sampler.sample( + instances, self.new_exemplar("What's your name?", "don't know") + ) + self.assertEqual(results, [instances[0], instances[3]]) + + results = sampler.sample( + instances, self.new_exemplar("What is the time?", "don't know") + ) + self.assertEqual(results, [instances[2], instances[0]]) + + num_samples = 1 + sampler = CloseTextSampler(num_samples, field="answer") + results = sampler.sample( + instances, self.new_exemplar("Who do I love?", "Mary Lu") + ) + self.assertEqual(results, [instances[3]]) + + def test_filter_with_wrong_field(self): + num_samples = 2 + sampler = CloseTextSampler(num_samples, field="wrong_field") + instances = [ + self.new_exemplar("What is your name?", "John"), + ] + instance = self.new_exemplar("What's your name?", "don't know") + with self.assertRaises(ValueError) as cm: + sampler.sample(instances, instance) + self.assertIn( + 'query "input_fields/wrong_field" did not match any item in dict', + str(cm.exception), + ) + + def test_end2end(self): + data = { + "train": [ + {"question": "What is your name?", "answer": "John"}, + {"question": "In which country is Paris located?", "answer": "France"}, + {"question": "At what time do we they eat dinner?", "answer": "22:00"}, + {"question": "What's your name, please?", "answer": "Mary"}, + {"question": "Is this your car?", "answer": "yes"}, + {"question": "What is your name?", "answer": "Sunny"}, + ], + "test": [ + {"question": "What's your name?", "answer": "John"}, + ], + } + + card = TaskCard( + loader=LoadFromDictionary(data=data), + task="tasks.qa.open", + preprocess_steps=[Wrap(field="answer", inside="list", to_field="answers")], + ) + + dataset = load_dataset( + card=card, + template="templates.qa.open.title", + demos_pool_size=5, + num_demos=2, + sampler=CloseTextSampler(field="question"), + ) + expected_output = """Answer the question. +Question: +What is your name? +Answer: +John + +Question: +What's your name, please? +Answer: +Mary + +Question: +What's your name? 
+Answer: +""" + self.assertEqual(dataset["test"][0]["source"], expected_output) + + +class TestFixedIndicesSampler(UnitxtTestCase): + """Tests for the FixedIndicesSampler object.""" + + @staticmethod + def new_exemplar(question: str, answer: str): + """Return an exemplar in a correct format.""" + return { + "input_fields": {"question": question, "answer": answer}, + } + + def test_sample(self): + instances = [ + self.new_exemplar("What is your name?", "John"), + self.new_exemplar("In which country is Paris located?", "France"), + self.new_exemplar("What's the time?", "22:00"), + self.new_exemplar("What is your name, please?", "Mary"), + ] + instance = self.new_exemplar("What's your name?", "don't know") + sampler = FixedIndicesSampler(indices=[2, 0]) + + results = sampler.sample(instances, instance) + self.assertEqual(results, [instances[2], instances[0]]) + + def test_out_of_bound_sample(self): + instances = [ + self.new_exemplar("What is your name?", "John"), + self.new_exemplar("In which country is Paris located?", "France"), + ] + + instance = self.new_exemplar("What's your name?", "don't know") + sampler = FixedIndicesSampler(indices=[2]) + with self.assertRaises(ValueError) as cm: + sampler.sample(instances, instance) + self.assertIn( + "FixedIndicesSampler 'indices' field contains index (2) which is out of bounds of the instance pool ( of size 2)", + str(cm.exception), + ) diff --git a/tests/library/test_tasks.py b/tests/library/test_tasks.py index c0dc477b4..ee931adb0 100644 --- a/tests/library/test_tasks.py +++ b/tests/library/test_tasks.py @@ -1,3 +1,5 @@ +from typing import Any, Dict, List + from unitxt.task import Task from tests.utils import UnitxtTestCase @@ -6,15 +8,15 @@ class TestTasks(UnitxtTestCase): def test_task_metrics_type_checking(self): operator = Task( - input_fields={"input": "str"}, - reference_fields={"label": "str"}, - prediction_type="str", + input_fields={"input": str}, + reference_fields={"label": str}, + prediction_type=str, metrics=["metrics.wer", "metrics.rouge"], ) operator.check_metrics_type() - operator.prediction_type = "Dict" + operator.prediction_type = Dict with self.assertRaises(ValueError) as e: operator.check_metrics_type() self.assertEqual( @@ -25,20 +27,20 @@ def test_task_metrics_type_checking(self): def test_task_metrics_type_checking_with_inputs_outputs(self): operator = Task( - inputs={"input": "str"}, - outputs={"label": "str"}, - prediction_type="str", + inputs={"input": str}, + outputs={"label": str}, + prediction_type=str, metrics=["metrics.wer", "metrics.rouge"], ) operator.check_metrics_type() - operator.prediction_type = "Dict" + operator.prediction_type = Dict[int, int] with self.assertRaises(ValueError) as e: operator.check_metrics_type() self.assertEqual( str(e.exception), - "The task's prediction type (typing.Dict) and 'metrics.wer' metric's prediction type " + "The task's prediction type (typing.Dict[int, int]) and 'metrics.wer' metric's prediction type " "() are different.", ) @@ -46,8 +48,8 @@ def test_task_missing_input_fields(self): with self.assertRaises(ValueError) as e: Task( input_fields=None, - reference_fields={"label": "str"}, - prediction_type="str", + reference_fields={"label": str}, + prediction_type=str, metrics=["metrics.wer", "metrics.rouge"], ) self.assertEqual( @@ -57,9 +59,9 @@ def test_task_missing_input_fields(self): def test_task_missing_reference_fields(self): with self.assertRaises(ValueError) as e: Task( - input_fields={"input": "int"}, + input_fields={"input": int}, reference_fields=None, - 
prediction_type="str", + prediction_type=str, metrics=["metrics.wer", "metrics.rouge"], ) self.assertEqual( @@ -69,10 +71,10 @@ def test_task_missing_reference_fields(self): def test_conflicting_input_fields(self): with self.assertRaises(ValueError) as e: Task( - inputs={"input": "int"}, - input_fields={"input": "int"}, - reference_fields={"label": "str"}, - prediction_type="str", + inputs={"input": int}, + input_fields={"input": int}, + reference_fields={"label": str}, + prediction_type=str, metrics=["metrics.wer", "metrics.rouge"], ) self.assertEqual( @@ -83,10 +85,10 @@ def test_conflicting_input_fields(self): def test_conflicting_output_fields(self): with self.assertRaises(ValueError) as e: Task( - input_fields={"input": "int"}, - reference_fields={"label": "str"}, - outputs={"label": "int"}, - prediction_type="str", + input_fields={"input": int}, + reference_fields={"label": str}, + outputs={"label": int}, + prediction_type=str, metrics=["metrics.wer", "metrics.rouge"], ) self.assertEqual( @@ -101,9 +103,9 @@ def test_set_defaults(self): ] operator = Task( - input_fields={"input": "str", "input_type": "str"}, - reference_fields={"label": "int", "labels": "List[int]"}, - prediction_type="Any", + input_fields={"input": str, "input_type": str}, + reference_fields={"label": int, "labels": List[int]}, + prediction_type=Any, metrics=["metrics.accuracy"], defaults={"input_type": "text", "labels": [0, 1, 2]}, ) @@ -130,6 +132,43 @@ def test_set_defaults(self): ) def test_verify_defaults(self): + operator = Task( + input_fields={"input": str}, + reference_fields={"label": int}, + prediction_type=Any, + metrics=["metrics.accuracy"], + ) + + default_name = "input_type" + operator.defaults = {"input_type": "text"} + with self.assertRaises(AssertionError) as e: + operator.verify_defaults() + self.assertEqual( + str(e.exception), + f"If specified, all keys of the 'defaults' must refer to a chosen " + f"key in either 'input_fields' or 'reference_fields'. 
However, the name '{default_name}' " + f"was provided which does not match any of the keys.", + ) + + operator.defaults = {"label": "LABEL"} + with self.assertRaises(AssertionError) as e: + operator.verify_defaults() + self.assertEqual( + str(e.exception), + "The value of 'label' from the 'defaults' must be of " + "type 'int', however, it is of type 'str'.", + ) + + operator.defaults = {"label": "LABEL"} + with self.assertRaises(AssertionError) as e: + operator.verify_defaults() + self.assertEqual( + str(e.exception), + "The value of 'label' from the 'defaults' must be of " + "type 'int', however, it is of type 'str'.", + ) + + def test_verify_defaults_string_type(self): operator = Task( input_fields={"input": "str"}, reference_fields={"label": "int"}, @@ -149,12 +188,11 @@ def test_verify_defaults(self): ) default_name = "label" - val_type = "int" operator.defaults = {"label": "LABEL"} with self.assertRaises(AssertionError) as e: operator.verify_defaults() self.assertEqual( str(e.exception), - f"The value of '{default_name}' from the 'defaults' must be of " - f"type '{val_type}', however, it is of type '{type(operator.defaults[default_name])}'.", + "The value of 'label' from the 'defaults' must be of " + "type 'int', however, it is of type 'str'.", ) diff --git a/tests/library/test_templates.py b/tests/library/test_templates.py index d3fcb6a25..9179d3870 100644 --- a/tests/library/test_templates.py +++ b/tests/library/test_templates.py @@ -27,8 +27,10 @@ def test_span_labeling_template_escaping(self): inputs = [ { - "inputs": {"text": "John,: Doe is from New York and works at Goo:gle."}, - "outputs": { + "input_fields": { + "text": "John,: Doe is from New York and works at Goo:gle." + }, + "reference_fields": { "spans_starts": [0, 19, 41], "spans_ends": [10, 27, 48], "labels": ["PER", "LOC", "ORG"], @@ -36,10 +38,10 @@ def test_span_labeling_template_escaping(self): }, }, { - "inputs": { + "input_fields": { "text": "John,: Doe is from New York and works at Goo:gle.", }, - "outputs": { + "reference_fields": { "spans_starts": [], "spans_ends": [], "labels": [], @@ -50,8 +52,10 @@ def test_span_labeling_template_escaping(self): targets = [ { - "inputs": {"text": "John,: Doe is from New York and works at Goo:gle."}, - "outputs": { + "input_fields": { + "text": "John,: Doe is from New York and works at Goo:gle." 
+ }, + "reference_fields": { "spans_starts": [0, 19, 41], "spans_ends": [10, 27, 48], "labels": ["PER", "LOC", "ORG"], @@ -64,10 +68,10 @@ def test_span_labeling_template_escaping(self): "target_prefix": "", }, { - "inputs": { + "input_fields": { "text": "John,: Doe is from New York and works at Goo:gle.", }, - "outputs": { + "reference_fields": { "spans_starts": [], "spans_ends": [], "labels": [], @@ -88,19 +92,19 @@ def test_multi_label_template(self): inputs = [ { - "inputs": {"text": "hello world"}, - "outputs": {"labels": ["cat", "dog"]}, + "input_fields": {"text": "hello world"}, + "reference_fields": {"labels": ["cat", "dog"]}, }, { - "inputs": {"text": "hello world"}, - "outputs": {"labels": ["man", "woman", "dog"]}, + "input_fields": {"text": "hello world"}, + "reference_fields": {"labels": ["man", "woman", "dog"]}, }, ] targets = [ { - "inputs": {"text": "hello world"}, - "outputs": {"labels": ["cat", "dog"]}, + "input_fields": {"text": "hello world"}, + "reference_fields": {"labels": ["cat", "dog"]}, "source": "hello world", "target": "cat, dog", "references": ["cat, dog"], @@ -108,8 +112,8 @@ def test_multi_label_template(self): "target_prefix": "", }, { - "inputs": {"text": "hello world"}, - "outputs": {"labels": ["man", "woman", "dog"]}, + "input_fields": {"text": "hello world"}, + "reference_fields": {"labels": ["man", "woman", "dog"]}, "source": "hello world", "target": "man, woman, dog", "references": ["man, woman, dog"], @@ -129,15 +133,15 @@ def _test_multi_reference_template(self, target, random_reference): inputs = [ { - "inputs": {"text": "who was he?"}, - "outputs": {"answer": ["Dan", "Yossi"]}, + "input_fields": {"text": "who was he?"}, + "reference_fields": {"answer": ["Dan", "Yossi"]}, } ] targets = [ { - "inputs": {"text": "who was he?"}, - "outputs": {"answer": ["Dan", "Yossi"]}, + "input_fields": {"text": "who was he?"}, + "reference_fields": {"answer": ["Dan", "Yossi"]}, "source": "This is my sentence: who was he?", "target": target, "references": ["Dan", "Yossi"], @@ -161,8 +165,8 @@ def _test_multi_reference_template_with_exception( input_format="This is my sentence: {text}", references_field="answer" ) instance = { - "inputs": {"text": "who was he?"}, - "outputs": {"answer": references}, + "input_fields": {"text": "who was he?"}, + "reference_fields": {"answer": references}, } with self.assertRaises(ValueError) as e: @@ -191,29 +195,35 @@ def test_input_output_template_and_standard_template(self): inputs = [ { - "inputs": {"labels": ["positive", "negative"], "text": "hello world"}, - "outputs": {"label": "positive"}, + "input_fields": { + "labels": ["positive", "negative"], + "text": "hello world", + }, + "reference_fields": {"label": "positive"}, }, { - "inputs": { + "input_fields": { "labels": ["positive", "negative"], "text": ["hello world\n", "hell"], }, - "outputs": {"label": "positive"}, + "reference_fields": {"label": "positive"}, }, { - "inputs": { + "input_fields": { "labels": ["positive", "negative"], "text": ["hello world\n", "hell"], }, - "outputs": {"label": ["positive", "1"]}, + "reference_fields": {"label": ["positive", "1"]}, }, ] targets = [ { - "inputs": {"labels": ["positive", "negative"], "text": "hello world"}, - "outputs": {"label": "positive"}, + "input_fields": { + "labels": ["positive", "negative"], + "text": "hello world", + }, + "reference_fields": {"label": "positive"}, "source": "This is my text:'hello world'", "target": "positive", "references": ["positive"], @@ -221,11 +231,11 @@ def 
test_input_output_template_and_standard_template(self): "target_prefix": "Sentiment is: ", }, { - "inputs": { + "input_fields": { "labels": ["positive", "negative"], "text": ["hello world\n", "hell"], }, - "outputs": {"label": "positive"}, + "reference_fields": {"label": "positive"}, "source": "This is my text:'hello world\n, hell'", "target": "positive", "references": ["positive"], @@ -233,11 +243,11 @@ def test_input_output_template_and_standard_template(self): "target_prefix": "Sentiment is: ", }, { - "inputs": { + "input_fields": { "labels": ["positive", "negative"], "text": ["hello world\n", "hell"], }, - "outputs": {"label": ["positive", "1"]}, + "reference_fields": {"label": ["positive", "1"]}, "source": "This is my text:'hello world\n, hell'", "target": "positive, 1", "references": ["positive, 1"], @@ -261,7 +271,7 @@ def test_input_output_template_and_standard_template(self): with self.assertRaises(TemplateFormatKeyError) as ke: err_input_template.process(inputs[0]) self.assertEqual( - "\"Available inputs are [labels, text] but InputOutputTemplate.input_format format requires a different ones: 'This is my text:'{no_text}''\"", + "\"Available input fields are [labels, text] but InputOutputTemplate.input_format format requires a different ones: 'This is my text:'{no_text}''\"", str(ke.exception), ) @@ -271,7 +281,7 @@ def test_input_output_template_and_standard_template(self): with self.assertRaises(TemplateFormatKeyError) as ke: err_output_template.process(inputs[0]) self.assertEqual( - "\"Available outputs are [label] but InputOutputTemplate.output_format format requires a different ones: '{no_label}'\"", + "\"Available reference fields are [label] but InputOutputTemplate.output_format format requires a different ones: '{no_label}'\"", str(ke.exception), ) @@ -286,15 +296,21 @@ def test_input_output_reference_template_and_standard_template(self): inputs = [ { - "inputs": {"labels": ["positive", "negative"], "text": "hello world"}, - "outputs": {"label": "positive", "reference": "1"}, + "input_fields": { + "labels": ["positive", "negative"], + "text": "hello world", + }, + "reference_fields": {"label": "positive", "reference": "1"}, }, ] targets = [ { - "inputs": {"labels": ["positive", "negative"], "text": "hello world"}, - "outputs": {"label": "positive", "reference": "1"}, + "input_fields": { + "labels": ["positive", "negative"], + "text": "hello world", + }, + "reference_fields": {"label": "positive", "reference": "1"}, "source": "This is my text:'hello world'", "target": "positive", "references": ["1"], @@ -306,23 +322,25 @@ def test_input_output_reference_template_and_standard_template(self): check_operator(template, inputs, targets, tester=self) with self.assertRaises(KeyError): - template.outputs_to_target_and_references( - outputs={"label": "positive", "references": "1"} + template.reference_fields_to_target_and_references( + reference_fields={"label": "positive", "references": "1"} ) class ToCoverTemplate(Template): - def inputs_to_source(self, inputs: Dict[str, object]) -> Tuple[str, str]: - ret = super().inputs_to_source(inputs) + def input_fields_to_source( + self, inputs: Dict[str, object] + ) -> Tuple[str, str]: + ret = super().input_fields_to_source(inputs) return (ret, ret) - def outputs_to_target_and_references( + def reference_fields_to_target_and_references( self, outputs: Dict[str, object] ) -> Tuple[str, List[str]]: - return super().outputs_to_target_and_references(outputs) + return super().reference_fields_to_target_and_references(outputs) to_cover_template = 
ToCoverTemplate() - to_cover_template.inputs_to_source({"a": 1}) - to_cover_template.outputs_to_target_and_references({"a": 1}) + to_cover_template.input_fields_to_source({"a": 1}) + to_cover_template.reference_fields_to_target_and_references({"a": 1}) class ToCoverTemplatesDict(TemplatesDict): def verify(self): @@ -344,7 +362,7 @@ def test_yes_no_template_process_input(self): "Is text_b of news?": {"text": "text_b", "class": "news"}, } for expected_processed_input, inputs in processed_input_to_inputs.items(): - processed = template.inputs_to_source(inputs) + processed = template.input_fields_to_source(inputs) self.assertEqual(expected_processed_input, processed) def test_yes_no_template_process_input_missing_input_field(self): @@ -355,9 +373,9 @@ def test_yes_no_template_process_input_missing_input_field(self): ) with self.assertRaises(TemplateFormatKeyError) as cm: wrong_field_name = "wrong_field_name" - template.inputs_to_source(inputs={wrong_field_name: ["news"]}) + template.input_fields_to_source(input_fields={wrong_field_name: ["news"]}) self.assertEqual( - "\"Available inputs are [wrong_field_name] but YesNoTemplate.input_format format requires a different ones: 'Expecting field {class} in input.'\"", + "\"Available input fields are [wrong_field_name] but YesNoTemplate.input_format format requires a different ones: 'Expecting field {class} in input.'\"", str(cm.exception), ) @@ -380,7 +398,9 @@ def test_yes_no_template_process_output(self): yes_answer: {label_field: ["news", "sports"], class_field: "news"}, } for expected_processed_output, outputs in processed_output_to_outputs.items(): - processed, references = template.outputs_to_target_and_references(outputs) + processed, references = template.reference_fields_to_target_and_references( + outputs + ) self.assertEqual(expected_processed_output, processed) self.assertEqual(references, [expected_processed_output]) @@ -397,17 +417,17 @@ def test_yes_no_template_process_output_missing_fields(self): with self.assertRaises(RuntimeError) as cm: outputs = {class_field: "news"} - template.outputs_to_target_and_references(outputs=outputs) + template.reference_fields_to_target_and_references(reference_fields=outputs) self.assertEqual( - f"Available outputs are {list(outputs.keys())}, missing required label field: '{label_field}'.", + f"Available reference_fields are {list(outputs.keys())}, missing required label field: '{label_field}'.", str(cm.exception), ) with self.assertRaises(RuntimeError) as cm: outputs = {label_field: ["news", "sports"]} - template.outputs_to_target_and_references(outputs=outputs) + template.reference_fields_to_target_and_references(reference_fields=outputs) self.assertEqual( - f"Available outputs are {list(outputs.keys())}, missing required class field: '{class_field}'.", + f"Available reference_fields are {list(outputs.keys())}, missing required class field: '{class_field}'.", str(cm.exception), ) @@ -419,8 +439,8 @@ def _test_with_wrong_labels_value(wrong_labels_value): input_format="", class_field="", label_field="labels" ) with self.assertRaises(RuntimeError) as cm: - template.outputs_to_target_and_references( - outputs={"labels": wrong_labels_value} + template.reference_fields_to_target_and_references( + reference_fields={"labels": wrong_labels_value} ) self.assertEqual( f"Unexpected value for gold_class_names: '{wrong_labels_value}'. 
Expecting a list.", @@ -439,8 +459,8 @@ def _test_with_wrong_class_value(wrong_class_value): input_format="", class_field=class_field, label_field=label_field ) with self.assertRaises(RuntimeError) as cm: - template.outputs_to_target_and_references( - outputs={ + template.reference_fields_to_target_and_references( + reference_fields={ label_field: ["news"], class_field: wrong_class_value, } @@ -462,8 +482,10 @@ def test_span_labeling_template_one_entity_escaping(self): inputs = [ { - "inputs": {"text": "John,: Doe is from New York and works at Goo:gle."}, - "outputs": { + "input_fields": { + "text": "John,: Doe is from New York and works at Goo:gle." + }, + "reference_fields": { "spans_starts": [0, 19, 41], "spans_ends": [10, 27, 48], "labels": ["PER", "PER", "ORG"], @@ -471,10 +493,10 @@ def test_span_labeling_template_one_entity_escaping(self): }, }, { - "inputs": { + "input_fields": { "text": "John,: Doe is from New York and works at Goo:gle.", }, - "outputs": { + "reference_fields": { "spans_starts": [], "spans_ends": [], "labels": [], @@ -485,8 +507,10 @@ def test_span_labeling_template_one_entity_escaping(self): targets = [ { - "inputs": {"text": "John,: Doe is from New York and works at Goo:gle."}, - "outputs": { + "input_fields": { + "text": "John,: Doe is from New York and works at Goo:gle." + }, + "reference_fields": { "spans_starts": [0, 19, 41], "spans_ends": [10, 27, 48], "labels": ["PER", "PER", "ORG"], @@ -499,10 +523,10 @@ def test_span_labeling_template_one_entity_escaping(self): "target_prefix": "", }, { - "inputs": { + "input_fields": { "text": "John,: Doe is from New York and works at Goo:gle.", }, - "outputs": { + "reference_fields": { "spans_starts": [], "spans_ends": [], "labels": [], @@ -523,8 +547,10 @@ def test_span_labeling_json_template(self): inputs = [ { - "inputs": {"text": "John,: Doe is from New York and works at Goo:gle."}, - "outputs": { + "input_fields": { + "text": "John,: Doe is from New York and works at Goo:gle." + }, + "reference_fields": { "spans_starts": [0, 19, 41], "spans_ends": [10, 27, 48], "labels": ["PER", "PER", "ORG"], @@ -532,10 +558,10 @@ def test_span_labeling_json_template(self): }, }, { - "inputs": { + "input_fields": { "text": "John,: Doe is from New York and works at Goo:gle.", }, - "outputs": { + "reference_fields": { "spans_starts": [], "spans_ends": [], "labels": [], @@ -546,8 +572,10 @@ def test_span_labeling_json_template(self): targets = [ { - "inputs": {"text": "John,: Doe is from New York and works at Goo:gle."}, - "outputs": { + "input_fields": { + "text": "John,: Doe is from New York and works at Goo:gle." 
+ }, + "reference_fields": { "spans_starts": [0, 19, 41], "spans_ends": [10, 27, 48], "labels": ["PER", "PER", "ORG"], @@ -562,10 +590,10 @@ def test_span_labeling_json_template(self): "target_prefix": "", }, { - "inputs": { + "input_fields": { "text": "John,: Doe is from New York and works at Goo:gle.", }, - "outputs": { + "reference_fields": { "spans_starts": [], "spans_ends": [], "labels": [], @@ -662,7 +690,7 @@ def test_multiple_choice_template(self): with self.assertRaises(ValueError) as ve: check_operator(template, inputs, targets, tester=self) self.assertEqual( - "Error processing instance '0' from stream 'test' in MultipleChoiceTemplate due to: \"Available inputs are [numerals, choices, text] but MultipleChoiceTemplate.input_format format requires a different ones: 'Text: {no_text}, Choices: {no_choices}.'\"", + "Error processing instance '0' from stream 'test' in MultipleChoiceTemplate due to: \"Available input fields are [numerals, choices, text] but MultipleChoiceTemplate.input_format format requires a different ones: 'Text: {no_text}, Choices: {no_choices}.'\"", str(ve.exception), ) @@ -751,7 +779,7 @@ def test_multiple_choice_template_with_shuffle(self): with self.assertRaises(ValueError) as ve: check_operator(template, inputs, targets, tester=self) self.assertEqual( - "Error processing instance '0' from stream 'test' in MultipleChoiceTemplate due to: \"Available inputs are [numerals, choices, text] but MultipleChoiceTemplate.input_format format requires a different ones: 'Text: {no_text}, Choices: {no_choices}.'\"", + "Error processing instance '0' from stream 'test' in MultipleChoiceTemplate due to: \"Available input fields are [numerals, choices, text] but MultipleChoiceTemplate.input_format format requires a different ones: 'Text: {no_text}, Choices: {no_choices}.'\"", str(ve.exception), ) @@ -780,15 +808,18 @@ def test_key_val_template_int_list(self): self.assertEqual(result, target) def test_render_template(self): - instance = {"inputs": {"text": "was so bad"}, "outputs": {"label": "negative"}} + instance = { + "input_fields": {"text": "was so bad"}, + "reference_fields": {"label": "negative"}, + } template = InputOutputTemplate( input_format='This is my sentence: "{text}"', output_format="{label}" ) result = template.process(instance) target = { - "inputs": {"text": "was so bad"}, - "outputs": {"label": "negative"}, + "input_fields": {"text": "was so bad"}, + "reference_fields": {"label": "negative"}, "source": 'This is my sentence: "was so bad"', "target": "negative", "references": ["negative"], @@ -802,14 +833,14 @@ def test_render_multi_reference_template(self): input_format="This is my sentence: {text}", references_field="answer" ) instance = { - "inputs": {"text": "who was he?"}, - "outputs": {"answer": ["Dan", "Yossi"]}, + "input_fields": {"text": "who was he?"}, + "reference_fields": {"answer": ["Dan", "Yossi"]}, } result = template.process(instance) target = { - "inputs": {"text": "who was he?"}, - "outputs": {"answer": ["Dan", "Yossi"]}, + "input_fields": {"text": "who was he?"}, + "reference_fields": {"answer": ["Dan", "Yossi"]}, "source": "This is my sentence: who was he?", "target": "Dan", "references": ["Dan", "Yossi"], diff --git a/tests/library/test_type_utils.py b/tests/library/test_type_utils.py index 60a265f63..e8584f9b9 100644 --- a/tests/library/test_type_utils.py +++ b/tests/library/test_type_utils.py @@ -1,9 +1,11 @@ import typing from unitxt.type_utils import ( + UnsupportedTypeError, format_type_string, infer_type, infer_type_string, + is_type, 
isoftype, issubtype, parse_type_string, @@ -267,9 +269,9 @@ def test_parse_malformed_string(self): def test_verify_required_schema(self): schema = { - "field_1": "Dict[str, float]", - "field_2": "int", - "field_3": "Tuple[List[str], Optional[str]]", + "field_1": typing.Dict[str, float], + "field_2": int, + "field_3": typing.Tuple[typing.List[str], typing.Optional[str]], } obj = { @@ -342,3 +344,25 @@ def test_format_type_string(self): "Union[List[Union[int,float]],Tuple[Union[int,float]]]", format_type_string("List[int|float]|Tuple[int|float]"), ) + + def test_is_type(self): + self.assertTrue(is_type(typing.Dict[str, str])) + self.assertTrue(is_type(typing.List[str])) + self.assertTrue(is_type(typing.Tuple[str, str])) + self.assertTrue(is_type(typing.Union[str, int])) + self.assertTrue(is_type(typing.Optional[str])) + self.assertTrue(is_type(str)) + self.assertTrue(is_type(float)) + self.assertTrue(is_type(int)) + self.assertTrue(is_type(list)) + self.assertTrue(is_type(dict)) + self.assertFalse(is_type([1, 2])) + + with self.assertRaises(UnsupportedTypeError): + isoftype(4, (int, int)) + + with self.assertRaises(UnsupportedTypeError): + isoftype(3, "int") + + with self.assertRaises(UnsupportedTypeError): + isoftype(3, typing.List) diff --git a/tests/utils.py b/tests/utils.py index 0fc27b732..8a6e34efe 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -33,7 +33,7 @@ def setUpClass(cls): unitxt.settings.allow_unverified_code = True unitxt.settings.use_only_local_catalogs = True # unitxt.settings.global_loader_limit = 300 - unitxt.settings.max_log_message_size = 1000 + unitxt.settings.max_log_message_size = 1000000000000 if settings.default_verbosity in ["error", "critical"]: if not sys.warnoptions: warnings.simplefilter("ignore")
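
For readers who want to try the new samplers outside the unittest harness, the sketch below restates the recipe that TestCloseTextSampler.test_end2end exercises above. The load_dataset/TaskCard keyword arguments mirror the test verbatim; the import paths and the toy data are assumptions made for illustration (the tests import these names directly), so adjust them to wherever this PR actually defines CloseTextSampler, FixedIndicesSampler, and Wrap:

# Illustrative sketch only -- it mirrors TestCloseTextSampler.test_end2end above.
# The import paths are assumptions; the tests import these names directly.
from unitxt.api import load_dataset
from unitxt.card import TaskCard
from unitxt.collections_operators import Wrap
from unitxt.loaders import LoadFromDictionary
from unitxt.splitters import CloseTextSampler, FixedIndicesSampler

data = {
    "train": [
        {"question": "What is your name?", "answer": "John"},
        {"question": "In which country is Paris located?", "answer": "France"},
        {"question": "What's your name, please?", "answer": "Mary"},
        {"question": "Is this your car?", "answer": "yes"},
    ],
    "test": [
        {"question": "What's your name?", "answer": "John"},
    ],
}

card = TaskCard(
    loader=LoadFromDictionary(data=data),
    task="tasks.qa.open",
    # tasks.qa.open expects a list-valued "answers" field, hence the Wrap step,
    # exactly as in the test's preprocess_steps.
    preprocess_steps=[Wrap(field="answer", inside="list", to_field="answers")],
)

# In-context demos are picked by textual closeness of the "question" field
# to the instance being rendered.
dataset = load_dataset(
    card=card,
    template="templates.qa.open.title",
    demos_pool_size=3,
    num_demos=2,
    sampler=CloseTextSampler(field="question"),
)
print(dataset["test"][0]["source"])

# FixedIndicesSampler, by contrast, picks demos by position in the pool.
# The tests above only exercise its sample() method directly:
exemplars = [
    {"input_fields": {"question": "What is your name?", "answer": "John"}},
    {"input_fields": {"question": "In which country is Paris located?", "answer": "France"}},
    {"input_fields": {"question": "What's the time?", "answer": "22:00"}},
]
query = {"input_fields": {"question": "What's your name?", "answer": "don't know"}}
picked = FixedIndicesSampler(indices=[2, 0]).sample(exemplars, query)
assert picked == [exemplars[2], exemplars[0]]

The direct sample() call at the end mirrors TestFixedIndicesSampler.test_sample: exemplars are chosen purely by position in the demos pool, and an out-of-range index raises a ValueError, as test_out_of_bound_sample asserts.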