Render Text2TextGenerationEvaluators' docstring examples #463

Merged · 3 commits · May 23, 2023
1 change: 1 addition & 0 deletions setup.py
@@ -115,6 +115,7 @@
"tensorflow>=2.3,!=2.6.0,!=2.6.1, <=2.10",
"torch",
# metrics dependencies
"accelerate", # for frugalscore (calls transformers' Trainer)
"bert_score>=0.3.6",
"rouge_score>=0.1.2",
"sacrebleu",
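Context for the new pin: the `frugalscore` metric drives `transformers`' `Trainer` when computing scores, and recent `transformers` releases need `accelerate` installed for `Trainer` to start. A minimal sketch of the code path that motivates the dependency (metric name and `compute` arguments as documented for `evaluate`; the inputs are illustrative):

```python
import evaluate

# Computing frugalscore runs transformers' Trainer under the hood, which in
# recent transformers releases fails at construction time unless `accelerate`
# is installed -- hence the new entry in setup.py above.
frugalscore = evaluate.load("frugalscore")
results = frugalscore.compute(
    predictions=["hello there", "general kenobi"],
    references=["hello there", "general kenobi"],
)
print(results["scores"])  # one score per prediction/reference pair
```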
2 changes: 1 addition & 1 deletion src/evaluate/evaluation_suite/__init__.py
@@ -18,7 +18,7 @@
@dataclass
class SubTask:
task_type: str
data: [Union[str, Dataset]] = None
data: Optional[Union[str, Dataset]] = None
subset: Optional[str] = None
split: Optional[str] = None
data_preprocessor: Optional[Callable] = None
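For context on the `SubTask` fix: `[Union[str, Dataset]]` is a one-element list containing a type, not a valid annotation, while `Optional[...]` is what the `= None` default actually implies. A sketch of the corrected dataclass, with the fields reproduced from the hunk above:

```python
from dataclasses import dataclass
from typing import Callable, Optional, Union

from datasets import Dataset


@dataclass
class SubTask:
    task_type: str
    # Before: `data: [Union[str, Dataset]] = None` -- a list literal, not a type.
    # After: Optional[...] correctly encodes "str, Dataset, or None".
    data: Optional[Union[str, Dataset]] = None
    subset: Optional[str] = None
    split: Optional[str] = None
    data_preprocessor: Optional[Callable] = None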
6 changes: 5 additions & 1 deletion src/evaluate/evaluator/audio_classification.py
@@ -13,7 +13,7 @@
# limitations under the License.

from numbers import Number
from typing import Any, Callable, Dict, Optional, Tuple, Union
from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Tuple, Union

from datasets import Dataset
from typing_extensions import Literal
@@ -23,6 +23,10 @@
from .base import EVALUATOR_COMPUTE_RETURN_DOCSTRING, EVALUTOR_COMPUTE_START_DOCSTRING, Evaluator


if TYPE_CHECKING:
from transformers import FeatureExtractionMixin, Pipeline, PreTrainedModel, PreTrainedTokenizer, TFPreTrainedModel


TASK_DOCUMENTATION = r"""
Examples:

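The `if TYPE_CHECKING:` guard added here (and in the other evaluator modules below) keeps `transformers` out of the runtime import path while still giving static type checkers the names used in annotations. A minimal sketch of the pattern; the `compute` signature is illustrative, not the evaluator's full one:

```python
from typing import TYPE_CHECKING, Optional, Union

if TYPE_CHECKING:
    # Evaluated only by static type checkers (mypy, pyright); at runtime this
    # block is skipped, so evaluate does not hard-depend on transformers.
    from transformers import Pipeline, PreTrainedModel, TFPreTrainedModel


def compute(
    model_or_pipeline: Optional[
        Union[str, "Pipeline", "PreTrainedModel", "TFPreTrainedModel"]
    ] = None,
):
    # String (forward-reference) annotations keep this valid even when
    # transformers was never actually imported.
    ...
```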
6 changes: 5 additions & 1 deletion src/evaluate/evaluator/automatic_speech_recognition.py
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Any, Callable, Dict, Optional, Tuple, Union
from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Tuple, Union

from datasets import Dataset
from typing_extensions import Literal
@@ -22,6 +22,10 @@
from .base import EVALUATOR_COMPUTE_RETURN_DOCSTRING, EVALUTOR_COMPUTE_START_DOCSTRING, Evaluator


if TYPE_CHECKING:
from transformers import Pipeline, PreTrainedModel, PreTrainedTokenizer, TFPreTrainedModel


TASK_DOCUMENTATION = r"""
Examples:
```python
6 changes: 5 additions & 1 deletion src/evaluate/evaluator/image_classification.py
@@ -13,7 +13,7 @@
# limitations under the License.

from numbers import Number
from typing import Any, Callable, Dict, Optional, Tuple, Union
from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Tuple, Union

from datasets import Dataset
from typing_extensions import Literal
@@ -23,6 +23,10 @@
from .base import EVALUATOR_COMPUTE_RETURN_DOCSTRING, EVALUTOR_COMPUTE_START_DOCSTRING, Evaluator


if TYPE_CHECKING:
from transformers import FeatureExtractionMixin, Pipeline, PreTrainedModel, PreTrainedTokenizer, TFPreTrainedModel


TASK_DOCUMENTATION = r"""
Examples:
```python
6 changes: 5 additions & 1 deletion src/evaluate/evaluator/question_answering.py
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Any, Callable, Dict, List, Optional, Tuple, Union
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union

# Lint as: python3
from datasets import Dataset
@@ -32,6 +32,10 @@
from .utils import DatasetColumn


if TYPE_CHECKING:
from transformers import Pipeline, PreTrainedModel, PreTrainedTokenizer, TFPreTrainedModel


logger = get_logger(__name__)


120 changes: 68 additions & 52 deletions src/evaluate/evaluator/text2text_generation.py
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Any, Callable, Dict, Optional, Tuple, Union
from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Tuple, Union

from datasets import Dataset
from typing_extensions import Literal
@@ -22,6 +22,10 @@
from .base import EVALUATOR_COMPUTE_RETURN_DOCSTRING, EVALUTOR_COMPUTE_START_DOCSTRING, Evaluator


if TYPE_CHECKING:
from transformers import Pipeline, PreTrainedModel, PreTrainedTokenizer, TFPreTrainedModel


TASK_DOCUMENTATION_KWARGS = r"""
input_column (`str`, defaults to `"text"`):
the name of the column containing the input text in the dataset specified by `data`.
@@ -31,6 +35,55 @@
The generation kwargs are passed to the pipeline and set the text generation strategy.
"""

TEXT2TEXT_TASK_DOCSTRING_EXAMPLE = r"""
Examples:
```python
>>> from evaluate import evaluator
>>> from datasets import load_dataset
>>> task_evaluator = evaluator("text2text-generation")
>>> data = load_dataset("cnn_dailymail", "3.0.0", split="validation[:40]")
>>> results = task_evaluator.compute(
>>> model_or_pipeline="facebook/bart-large-cnn",
>>> data=data,
>>> input_column="article",
>>> label_column="highlights",
>>> metric="rouge",
>>> )
```
"""

SUMMARIZATION_TASK_DOCSTRING_EXAMPLE = r"""
Examples:
```python
>>> from evaluate import evaluator
>>> from datasets import load_dataset
>>> task_evaluator = evaluator("summarization")
>>> data = load_dataset("cnn_dailymail", "3.0.0", split="validation[:40]")
>>> results = task_evaluator.compute(
>>> model_or_pipeline="facebook/bart-large-cnn",
>>> data=data,
>>> input_column="article",
>>> label_column="highlights",
>>> )
```
"""


TRANSLATION_TASK_DOCSTRING_EXAMPLE = r"""
Examples:
```python
>>> from evaluate import evaluator
>>> from datasets import load_dataset
>>> task_evaluator = evaluator("translation")
>>> data = load_dataset("wmt19", "fr-de", split="validation[:40]")
>>> data = data.map(lambda x: {"text": x["translation"]["de"], "label": x["translation"]["fr"]})
>>> results = task_evaluator.compute(
>>> model_or_pipeline="Helsinki-NLP/opus-mt-de-fr",
>>> data=data,
>>> )
```
"""


class Text2TextGenerationEvaluator(Evaluator):
"""
Expand All @@ -50,7 +103,10 @@ def predictions_processor(self, predictions, label_mapping):
return {"predictions": [pred[f"{self.PREDICTION_PREFIX}_text"] for pred in predictions]}

@add_start_docstrings(
EVALUTOR_COMPUTE_START_DOCSTRING, TASK_DOCUMENTATION_KWARGS, EVALUATOR_COMPUTE_RETURN_DOCSTRING
EVALUTOR_COMPUTE_START_DOCSTRING,
TASK_DOCUMENTATION_KWARGS,
EVALUATOR_COMPUTE_RETURN_DOCSTRING,
TEXT2TEXT_TASK_DOCSTRING_EXAMPLE,
)
def compute(
self,
@@ -71,23 +127,6 @@ def compute(
label_column: str = "label",
generation_kwargs: dict = None,
) -> Tuple[Dict[str, float], Any]:
"""
Examples:
```python
>>> from evaluate import evaluator
>>> from datasets import load_dataset
>>> task_evaluator = evaluator("text2text-generation")
>>> data = load_dataset("cnn_dailymail", "3.0.0", split="validation[:40]")
>>> results = task_evaluator.compute(
>>> model_or_pipeline="facebook/bart-large-cnn",
>>> data=data,
>>> input_column="article",
>>> label_column="highlights",
>>> metric="rouge",
>>> )
```
"""

if generation_kwargs is not None:
self.PIPELINE_KWARGS.update(generation_kwargs)

@@ -125,7 +164,10 @@ def __init__(self, task="summarization", default_metric_name=None):
super().__init__(task, default_metric_name=default_metric_name)

@add_start_docstrings(
EVALUTOR_COMPUTE_START_DOCSTRING, TASK_DOCUMENTATION_KWARGS, EVALUATOR_COMPUTE_RETURN_DOCSTRING
EVALUTOR_COMPUTE_START_DOCSTRING,
TASK_DOCUMENTATION_KWARGS,
EVALUATOR_COMPUTE_RETURN_DOCSTRING,
SUMMARIZATION_TASK_DOCSTRING_EXAMPLE,
)
def compute(
self,
@@ -146,22 +188,6 @@ def compute(
label_column: str = "label",
generation_kwargs: dict = None,
) -> Tuple[Dict[str, float], Any]:
"""
Examples:
```python
>>> from evaluate import evaluator
>>> from datasets import load_dataset
>>> task_evaluator = evaluator("summarization")
>>> data = load_dataset("cnn_dailymail", "3.0.0", split="validation[:40]")
>>> results = task_evaluator.compute(
>>> model_or_pipeline="facebook/bart-large-cnn",
>>> data=data,
>>> input_column="article",
>>> label_column="highlights",
>>> )
```
"""

result = super().compute(
model_or_pipeline=model_or_pipeline,
data=data,
Expand All @@ -176,6 +202,7 @@ def compute(
random_state=random_state,
input_column=input_column,
label_column=label_column,
generation_kwargs=generation_kwargs,
)

return result
@@ -196,7 +223,10 @@ def __init__(self, task="translation", default_metric_name=None):
super().__init__(task, default_metric_name=default_metric_name)

@add_start_docstrings(
EVALUTOR_COMPUTE_START_DOCSTRING, TASK_DOCUMENTATION_KWARGS, EVALUATOR_COMPUTE_RETURN_DOCSTRING
EVALUTOR_COMPUTE_START_DOCSTRING,
TASK_DOCUMENTATION_KWARGS,
EVALUATOR_COMPUTE_RETURN_DOCSTRING,
TRANSLATION_TASK_DOCSTRING_EXAMPLE,
)
def compute(
self,
@@ -217,21 +247,6 @@ def compute(
label_column: str = "label",
generation_kwargs: dict = None,
) -> Tuple[Dict[str, float], Any]:
"""
Examples:
```python
>>> from evaluate import evaluator
>>> from datasets import load_dataset
>>> task_evaluator = evaluator("translation")
>>> data = load_dataset("wmt19", "fr-de", split="validation[:40]")
>>> data = data.map(lambda x: {"text": x["translation"]["de"], "label": x["translation"]["fr"]})
>>> results = task_evaluator.compute(
>>> model_or_pipeline="Helsinki-NLP/opus-mt-de-fr",
>>> data=data,
>>> )
```
"""

result = super().compute(
model_or_pipeline=model_or_pipeline,
data=data,
Expand All @@ -246,6 +261,7 @@ def compute(
random_state=random_state,
input_column=input_column,
label_column=label_column,
generation_kwargs=generation_kwargs,
)

return result
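Beyond moving the examples into module-level constants that `add_start_docstrings` appends to each `compute` docstring (so the doc builder can render them), the hunks above fix a real bug: `SummarizationEvaluator.compute` and `TranslationEvaluator.compute` previously dropped `generation_kwargs` instead of forwarding it to `super().compute()`. A usage sketch under that fix; model, dataset slice, and columns come from the docstring example above, while the `max_length`/`num_beams` values are illustrative:

```python
from datasets import load_dataset
from evaluate import evaluator

task_evaluator = evaluator("summarization")
data = load_dataset("cnn_dailymail", "3.0.0", split="validation[:40]")

# With the fix, these kwargs now reach the underlying generation pipeline;
# before it, they were silently ignored by the summarization and
# translation evaluators.
results = task_evaluator.compute(
    model_or_pipeline="facebook/bart-large-cnn",
    data=data,
    input_column="article",
    label_column="highlights",
    generation_kwargs={"max_length": 128, "num_beams": 4},
)
```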
6 changes: 5 additions & 1 deletion src/evaluate/evaluator/text_classification.py
@@ -13,7 +13,7 @@
# limitations under the License.

from numbers import Number
from typing import Any, Callable, Dict, Optional, Tuple, Union
from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Tuple, Union

from datasets import Dataset, load_dataset
from typing_extensions import Literal
@@ -24,6 +24,10 @@
from .utils import DatasetColumnPair


if TYPE_CHECKING:
from transformers import FeatureExtractionMixin, Pipeline, PreTrainedModel, PreTrainedTokenizer, TFPreTrainedModel


TASK_DOCUMENTATION = r"""
Examples:
```python
6 changes: 5 additions & 1 deletion src/evaluate/evaluator/token_classification.py
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Any, Callable, Dict, List, Optional, Tuple, Union
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union

from datasets import ClassLabel, Dataset, Sequence
from typing_extensions import Literal
@@ -23,6 +23,10 @@
from .utils import DatasetColumn


if TYPE_CHECKING:
from transformers import Pipeline, PreTrainedModel, PreTrainedTokenizer, TFPreTrainedModel


TASK_DOCUMENTATION = r"""
The dataset input and label columns are expected to be formatted as a list of words and a list of labels respectively, following [conll2003 dataset](https://huggingface.co/datasets/conll2003). Datasets whose inputs are single strings, and labels are a list of offset are not supported.

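As the token-classification documentation above notes, inputs must be a list of words aligned with a list of labels, conll2003-style. A hedged sketch of one such row (tag names follow the usual IOB scheme; in conll2003 itself `ner_tags` are `ClassLabel` integer ids that decode to strings like these):

```python
# One dataset row in the expected format: tokens and labels are parallel lists,
# one label per word -- not a single string with character offsets.
example = {
    "tokens": ["HuggingFace", "is", "based", "in", "New", "York"],
    "ner_tags": ["B-ORG", "O", "O", "O", "B-LOC", "I-LOC"],
}
assert len(example["tokens"]) == len(example["ner_tags"])
```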