Benjams/add rag task card and metric #1044

Merged
merged 47 commits into from Jul 29, 2024
Changes from all commits (47 commits)
87f2bbd
Fix bug in data classes and add support for field overriding in field…
elronbandel Jul 17, 2024
3c146b1
Added seed to LLM as judges for consistent results (#1029)
yoavkatz Jul 19, 2024
ab7c806
replace type and __type__ in type error (#1035)
perlitz Jul 21, 2024
408519d
add rag_end_to_end metrics
BenjSz Jul 22, 2024
9e379c3
add rag_end_to_end metrics
BenjSz Jul 22, 2024
236ee4b
Add task rag_end_to_end
BenjSz Jul 22, 2024
d520274
add card for clapnq end_to_end
BenjSz Jul 22, 2024
ae8dbd6
add sandbox_benjams
BenjSz Jul 22, 2024
70bcda3
add subset
BenjSz Jul 22, 2024
3e67e36
add a reduction of clap_nq
BenjSz Jul 22, 2024
0c68ff0
add a reduction of clap_nq
BenjSz Jul 22, 2024
514e573
remove constants
BenjSz Jul 22, 2024
c2d6695
rename sandbox_benjams to sandbox
BenjSz Jul 22, 2024
95a6d05
remove sandbox
BenjSz Jul 22, 2024
76ee53c
Add string to context id in rag (#1036)
perlitz Jul 21, 2024
bf39367
Fixed issues with fresh install (#1037)
yoavkatz Jul 22, 2024
89bb5b6
add validation to tldr, remove shuffle from billsum (#1038)
alonh Jul 22, 2024
0e23b01
Refactor Rouge and Meteor to InstanceMetric for faster score computat…
yoavkatz Jul 22, 2024
1d7b1ef
Add CloseTextSampler and FixedIndicesSampler (#1034)
yoavkatz Jul 22, 2024
6b71581
changed input and output of templates to "input_fields" and "referenc…
yoavkatz Jul 22, 2024
8354996
FinQA - filter problematic examples (#1039)
ShirApp Jul 22, 2024
0109c48
Arena hard elad2 (#1026)
eladven Jul 22, 2024
924395c
demo's target prefix is now taken from demo instance (#1031)
dafnapension Jul 22, 2024
cfb7405
remove the reduced clap_nq
BenjSz Jul 23, 2024
af1e649
define an empty template for rag end_to_end
BenjSz Jul 23, 2024
11c9186
Implement metrics ensemble (#1047)
eladven Jul 23, 2024
cbb47c2
add load_json_predictions as processor in the template
BenjSz Jul 23, 2024
4a3633c
add the processors/load_json_predictions.json generated to the catalog
BenjSz Jul 23, 2024
955d54d
Add flores101 (#1053)
perlitz Jul 23, 2024
1f031bd
Added example for selection of demos (#1052)
yoavkatz Jul 23, 2024
6181b17
fix - building test is not working. The reason is that opendatasets p…
BenjSz Jul 24, 2024
261de93
add overwrite
BenjSz Jul 29, 2024
1f859f2
Update introduction.rst - - copy edits (grammar, consistency, clarity…
welisheva22 Jul 28, 2024
d95990e
Fix typo in japanese_llama system prompt (issue #964) (#1056)
bnayahu Jul 28, 2024
9af8e18
Allow assigning None in overwrites when fetching artifacts with modif…
dafnapension Jul 28, 2024
5521e70
Make sure preparation times printed fully and nicely (#1046)
elronbandel Jul 28, 2024
6852c70
numeric nlg - template changes (#1041)
ShirApp Jul 28, 2024
d9c004f
add judge input to the metric (#1064)
OfirArviv Jul 28, 2024
aeb0d5b
Unitxt capitalization adding_dataset.rst (#1057)
hanansinger Jul 28, 2024
3daa573
fixed the score_ci inconsistency issue (#1065)
dafnapension Jul 28, 2024
6f3f342
Use of conventional python types in input definition of tasks and met…
elronbandel Jul 29, 2024
b3820b2
Added prediction type to llm as jusdge to avoid warning (#1072)
yoavkatz Jul 29, 2024
947f5ef
Fixed clapnq to check with reasonable error values
yoavkatz Jul 29, 2024
aeb4d31
fix the type hint
BenjSz Jul 29, 2024
6995e47
update catalog
BenjSz Jul 29, 2024
810b803
Add metric "metrics.rag.retrieval_at_k" to catalog (#1074)
matanor Jul 29, 2024
51ac829
merge - resolve conflict
BenjSz Jul 29, 2024
2 changes: 1 addition & 1 deletion .github/workflows/catalog_consistency.yml
Expand Up @@ -25,7 +25,7 @@ jobs:
- uses: actions/setup-python@v5
with:
python-version: '3.9'

- run: curl -LsSf https://astral.sh/uv/install.sh | sh
- run: uv pip install --system -e ".[tests]"

2 changes: 1 addition & 1 deletion .github/workflows/catalog_preparation.yml
Expand Up @@ -26,7 +26,7 @@ jobs:
- uses: actions/setup-python@v5
with:
python-version: '3.9'

- run: curl -LsSf https://astral.sh/uv/install.sh | sh
- run: uv pip install --system ".[tests]"

12 changes: 9 additions & 3 deletions .pre-commit-config.yaml
Expand Up @@ -5,10 +5,16 @@ repos:
# Ruff version.
rev: v0.1.6
hooks:
# Run the linter.
# Run the linter on all files except the specific one
- id: ruff
args: [ --fix ]
# Run the formatter.
args: [--fix]
exclude: src/unitxt/metrics.py
# Run the linter on the specific file with the ignore flag
- id: ruff
name: ruff (src/unitxt/metrics.py)
files: src/unitxt/metrics.py
args: [--fix, --ignore, C901]
# Run the formatter
- id: ruff-format

- repo: https://github.com/ibm/detect-secrets
4 changes: 2 additions & 2 deletions .secrets.baseline
Expand Up @@ -3,7 +3,7 @@
"files": "^.secrets.baseline$",
"lines": null
},
"generated_at": "2024-07-09T07:07:12Z",
"generated_at": "2024-07-29T09:03:34Z",
"plugins_used": [
{
"name": "AWSKeyDetector"
Expand Down Expand Up @@ -82,7 +82,7 @@
"hashed_secret": "fa172616e9af3d2a24b5597f264eab963fe76889",
"is_secret": false,
"is_verified": false,
"line_number": 1531,
"line_number": 1841,
"type": "Hex High Entropy String",
"verified_result": null
}
10 changes: 5 additions & 5 deletions docs/docs/adding_dataset.rst
Expand Up @@ -2,13 +2,13 @@

.. note::

To use this tutorial, you need to :ref:`install unitxt <install_unitxt>`.
To use this tutorial, you need to :ref:`install Unitxt <install_unitxt>`.

=================
Datasets ✨
=================

This guide will assist you in adding or using your new dataset in unitxt.
This guide will assist you in adding or using your new dataset in Unitxt.

The information needed for loading your data will be defined in :class:`TaskCard <unitxt.card.TaskCard>` class:

Expand Down Expand Up @@ -46,9 +46,9 @@ We will use the `bleu` metric for a reference based evaluation.
.. code-block:: python

task=Task(
input_fields= { "text" : "str", "source_language" : "str", "target_language" : "str"},
reference_fields= {"translation" : "str"},
prediction_type="str",
input_fields= { "text" : str, "source_language" : str, "target_language" : str},
reference_fields= {"translation" : str},
prediction_type=str,
metrics=["metrics.bleu"],
),

87 changes: 44 additions & 43 deletions docs/docs/adding_metric.rst
Expand Up @@ -18,17 +18,18 @@ You specify the metrics in the Task.
For example:

.. code-block:: python
task = Task(
input_fields={ "question" : "str" },
reference_fields={ "answer" : str },
prediction_type="str",
metrics=[
"metrics.rouge",
"metrics.normalized_sacrebleu",
"metrics.bert_score.deberta_xlarge_mnli",
"metrics.bert_score.deberta_large_mnli"
],
)

task = Task(
input_fields={"question" : str},
reference_fields={"answer" : str},
prediction_type=str,
metrics=[
"metrics.rouge",
"metrics.normalized_sacrebleu",
"metrics.bert_score.deberta_xlarge_mnli",
"metrics.bert_score.deberta_large_mnli"
],
)

You can see the full list of built-in metrics in the :ref:`Metrics section <catalog.tasks>`.
In this section, we explain Unitxt metrics and show how to add new metrics.
Expand All @@ -49,8 +50,8 @@ string class names as predictions. The post processor may convert the string o
(e.g. by splitting using a separator).

2. **References** (`references` - optional): This is a list of gold references, of the same type as the prediction.
For example, if the prediction is a string, the references field is a list of strings. If the prediction is
a list of strings (e.g. in multi-label classification), then the references field is a *list* of lists of strings.
The metric should return a perfect score if the prediction is equal to one of the references.

3. **Task data** (`task_data` - optional) - all the input and output fields of a task as a dictionary.
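
To make these three inputs concrete, here is a sketch of what a metric might receive for a hypothetical two-instance multi-label classification task (illustrative values only, not taken from a real dataset):

.. code-block:: python

    predictions = [
        ["cat", "dog"],   # processed prediction of instance 0
        ["bird"],         # processed prediction of instance 1
    ]
    references = [
        [["cat", "dog"], ["dog", "cat"]],  # instance 0: two acceptable gold answers
        [["bird"]],                        # instance 1: a single gold answer
    ]
    task_data = [
        {"text": "a photo of a cat and a dog", "labels": ["cat", "dog"]},  # all task fields of instance 0
        {"text": "a photo of a bird", "labels": ["bird"]},                 # all task fields of instance 1
    ]
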
Expand All @@ -72,8 +73,8 @@ Metric Outputs
By default, each metric provides scores for each instance separately and global aggregated scores over all instances together.
The output of the metrics is a nested dictionary per instance.

The scores calculated on instance `i` by itself are found in `results[i]["score"]["instance"]`.
The global scores calculated over all instances are found in `results[i]["score"]["global"]`.
Note that the global scores are the same in all instances, so usually `results[0]["score"]["global"]` is used to get the global scores.
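
As a minimal sketch, assuming `results` is the per-instance list produced by unitxt evaluation and structured as described above, the scores can be read like this:

.. code-block:: python

    from typing import Any, Dict, List

    def summarize_scores(results: List[Dict[str, Any]]) -> None:
        """Print the main global score and each instance's main score.

        Assumes every instance carries its own scores under ["score"]["instance"]
        and a copy of the aggregated scores under ["score"]["global"].
        """
        global_scores = results[0]["score"]["global"]  # identical on every instance
        print("main global score:", global_scores["score"])
        for i, instance in enumerate(results):
            print(f"instance {i} score:", instance["score"]["instance"]["score"])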

A metric could return multiple scores, but it should always return a field called `score` with the main score of the metric,
Expand All @@ -92,8 +93,8 @@ For example, the score list for an instance could be:
The global scores are calculated over all instances.

Metrics can also calculate confidence intervals for the global scores.
This gives you an assessment of the inherent noise in the scores. When you compare runs on the same data, check if their confidence
intervals overlap. If so, the difference may not be statistically significant.

.. code-block:: python

Expand All @@ -111,7 +112,7 @@ Metric Outputs with Multiple Metrics
-------------------------------------

When multiple metrics are specified, their scores are appended to the score list.
If multiple metrics have the same score names, the score of the metric that appears first in the metrics list has precedence.

If you want to avoid the scores being overwritten by other metrics, you can add a prefix to each metric score.

Expand All @@ -127,7 +128,7 @@ If you want to avoid the scores being overwritten by other metrics, you can add
)

Note that the ``score`` and ``score_names`` are always taken from the first metric in the metric list.

Metric Base Classes
-------------------

Expand All @@ -139,19 +140,19 @@ scores are calculated.

``InstanceMetric`` - Class for metrics in which the global scores are calculated by aggregating the instance scores.
Typically, the global score is the average of all instance scores. ``InstanceMetric`` first evaluates each instance separately,
and then aggregates the instance scores. Some examples of instance metrics are `Accuracy`, `TokenOverlap`, `CharEditDistance`.

``BulkInstanceMetric`` - Similar to ``InstanceMetric``, it is for metrics in which the global score can be calculated by aggregating the instance scores. However,
for implementation efficiency reasons, it is better to run them in bulk (for example, when using LLMs during score calculations).
``BulkInstanceMetric`` runs on a batch of instances each time, but then aggregates the instance scores as before.
Some examples of bulk instance metrics are `SentenceBert`, `Reward`.

``GlobalMetric`` - Class for metrics for which the global scores must be calculated over all the instances together.
Some examples of global metrics are `f1`, `Spearman`, `Kendall Tau`. Note that by default global metrics are executed once per instance
to generate per instance scores, and then once again over all instances together. So if there are 100 instances,
it will first be called 100 times, each on a single instance, and then one time on all 100 instances.

Instance scores of `GlobalMetrics` are useful for error analysis. Consider the f1 score, for example.
It can be calculated only on all instances together. Yet it is useful to report the score of every instance
so you can see that good instances get an f1 score of 1 and bad ones get 0.

Expand All @@ -163,14 +164,14 @@ so you can see that good instances get f1 score of 1 and bad ones get 0.
Adding a New Instance metric
----------------------------

Assume we want to create a referenceless metric for the task of adding two numbers.
It will take the processed prediction of the task (an integer) and compare it to the sum of the
two task input fields `num1` and `num2`. It will check, for each instance,
how close the predicted sum is to the actual sum.
The metric can be configured with a `relative_tolerance` threshold for approximate comparison.
If the difference between the prediction and the actual result is smaller than the `relative_tolerance`
threshold, the instance score is 1. Otherwise, the instance score is 0.
The global accuracy result is the mean of the instance scores.
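
The core scoring rule can be sketched in plain Python, independently of the metric class (``relative_tolerance`` is the threshold described above):

.. code-block:: python

    import math

    def sum_accuracy_instance_score(
        prediction: int, num1: int, num2: int, relative_tolerance: float = 0.0
    ) -> float:
        """Return 1.0 if the prediction is within the relative tolerance of num1 + num2, else 0.0."""
        return 1.0 if math.isclose(prediction, num1 + num2, rel_tol=relative_tolerance) else 0.0

The unitxt metric shown next packages this rule as an ``InstanceMetric``: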

.. code-block:: python

Expand All @@ -179,7 +180,7 @@ Adding a New Instance metric
main_score = "sum_accuracy" # name of the main score
reduction_map = {"mean": ["sum_accuracy"]} # defines that the global score is a mean of the instance scores
ci_scores = ["sum_accuracy"] # define that confidence internal should be calculated on the score
prediction_type = "int" # the metric expect the prediction as an int
prediction_type = int # the metric expect the prediction as an int

# Relation tolerance for errors by default it is 0, but can be changed for approximate comparison
relative_tolerance : float = 0
Expand Down Expand Up @@ -253,15 +254,15 @@ This is a global metric because it performs the calculation over all the instanc

The score is negative (up to -1) if predictions tend to be less accurate when reference values are larger.
The score is close to 0 if the magnitude of the reference answer does not correlate with accuracy.
The score is positive (up to 1) if predictions tend to be less accurate when reference values are smaller.

In most realistic cases, the score is likely to be zero or negative.

"""
prediction_type = "int"
prediction_type = int
main_score="sensitivity_to_numeric_magnitude"
single_reference_per_prediction = True # validates only one reference is passed per prediction

def compute(
self, references: List[List[int]], predictions: List[int], task_data: List[Dict]
) -> dict:
Expand All @@ -277,9 +278,9 @@ This is a global metric because it performs the calculation over all the instanc
1. Calculating confidence intervals for global metrics can be costly if each invocation of the metric takes a long time.
To avoid calculating confidence intervals for global metrics, set `n_resamples = 0`.

2. Unitxt calculates instance results in global metrics to allow viewing the output on a single instance.
This can help ensure metric behavior is correct, because it can be checked on a single instance.
However, sometimes it does not make sense because the global metric assumes a minimum number of instances.
The per instance calculations can be disabled by setting `process_single_instances = False`.
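
For example, a minimal sketch of a global metric that applies both settings (only the relevant attributes are shown; the rest of the class is omitted):

.. code-block:: python

    class MySlowGlobalMetric(GlobalMetric):
        main_score = "my_score"
        n_resamples = 0                   # disable confidence-interval resampling
        process_single_instances = False  # skip the per-instance global computation

        def compute(self, references, predictions, task_data) -> dict:
            ...  # compute the global score over all instances together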

Managing Metric Dependencies
Expand Down Expand Up @@ -340,22 +341,22 @@ This is done using the predefined HuggingfaceMetric class.
metric = HuggingfaceMetric(
hf_metric_name="bleu", # The name of the metric in huggingface
main_score="bleu", # The main score (assumes the metric returns this score name)
prediction_type="str" # The type of the prediction and references (note that by default references are a list of the prediction_type)
prediction_type=str # The type of the prediction and references (note that by default references are a list of the prediction_type)
)
add_to_catalog(metric, "metrics.bleu", overwrite=True)

By default, the HuggingfaceMetric wrapper passes only the `predictions` and `references` fields to
the metric. You can also pass fields from the task_data inputs by specifying `hf_additional_input_fields`.
the metrics. You can also pass fields from the task_data inputs, by specifying `hf_additional_input_fields`.
For example:

.. code-block:: python

metric = HuggingfaceMetric(
...
hf_additional_input_fields_pass = ["num1","num2"], # passes the task's num1 and num2 fields
...
)

In the above example, the `num1` and `num2` fields are passed as lists of values to the metric
(each element in the list corresponds to an instance). If you want to pass a scalar (single) value to the metric
Expand All @@ -367,13 +368,13 @@ you can use:
...
hf_additional_input_fields_pass_one_value=["tokenize"],
...
)


This assumes the field has the same value in all instances.


Note that Huggingface metrics are independent of the tasks they are used for, and receive arbitrary types of predictions, references, and additional
parameters. It may be needed to map between unitxt field names, values, and types to the corresponding interface of the metric, using
the `MetricPipeline` described in the previous section.

6 changes: 3 additions & 3 deletions docs/docs/adding_task.rst
Expand Up @@ -25,9 +25,9 @@ The task is formally defined as:
from unitxt.blocks import Task

task = Task(
input_fields={"num1" : "int", "num2" : "int"},
reference_fields={"sum" : "int"},
prediction_type="int",
input_fields={"num1" : int, "num2" : int},
reference_fields={"sum" : int},
prediction_type=int,
metrics=[
"metrics.sum_accuracy",
"metrics.sum_accuracy_approximate"
30 changes: 16 additions & 14 deletions docs/docs/adding_template.rst
Expand Up @@ -77,30 +77,32 @@ Making Your Custom Template
----------------------------

In order to make your own template, you need to create a class inheriting from `Template` and
implementing its two abstract methods:
implementing its abstract methods:

.. code-block:: python

@abstractmethod
def inputs_to_source(self, inputs: Dict[str, object]) -> Tuple[str, str]:
@abstractmethod
def input_fields_to_source(self, input_fields: Dict[str, object]) -> str:
"""Create the textual input for the model from the input fields"""
pass

@abstractmethod
def outputs_to_target_and_references(
self, outputs: Dict[str, object]
) -> Tuple[str, List[str]]:
def reference_fields_to_target_and_references(self, reference_fields: Dict[str, object]) -> Tuple[str, List[str]]:
"""Create a list of references from the reference fields. Also returns one of the references
as the 'target' - the reference used if the instance is used as a demonstration."""
pass

For instance, this template passes all the input fields to the model as a json string.
It also formats the references by taking two of the dataset's reference fields: 'top_answer' and 'alternative_answer'.

.. code-block:: python

class MyCustomTemplate(Template):

def inputs_to_source(self, inputs: Dict[str, object]) -> Tuple[str, str]:
return str(inputs) # use all the task inputs fields in their dictionary look

def outputs_to_target_and_references(
self, outputs: Dict[str, object]
) -> Tuple[str, List[str]]:
return outputs["label"], [outputs["label"]]
    def input_fields_to_source(self, input_fields: Dict[str, object]) -> str:
        # provide the json string with all input fields as the input to the model
        return json.dumps(input_fields)

    def reference_fields_to_target_and_references(
        self, reference_fields: Dict[str, object]
    ) -> Tuple[str, List[str]]:
        # 'top_answer' is used as the target; both fields serve as references
        return reference_fields["top_answer"], [
            reference_fields["top_answer"],
            reference_fields["alternative_answer"],
        ]
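
A quick illustration of what the new `input_fields_to_source` and `reference_fields_to_target_and_references` methods return (hypothetical field values; any additional configuration the Template base class may require is omitted):

.. code-block:: python

    template = MyCustomTemplate()

    source = template.input_fields_to_source({"question": "What is the capital of France?"})
    # source == '{"question": "What is the capital of France?"}'

    target, references = template.reference_fields_to_target_and_references(
        {"top_answer": "Paris", "alternative_answer": "The capital of France is Paris"}
    )
    # target == "Paris"
    # references == ["Paris", "The capital of France is Paris"]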