Render Text2TextGenerationEvaluators' docstring examples (#463)
* Fix evaluator's docstring examples

* Fix CI

* Minor fix
mariosasko authored May 23, 2023
1 parent 0ca575d commit c6d906f
Showing 9 changed files with 100 additions and 59 deletions.
1 change: 1 addition & 0 deletions setup.py
@@ -115,6 +115,7 @@
"tensorflow>=2.3,!=2.6.0,!=2.6.1, <=2.10",
"torch",
# metrics dependencies
"accelerate", # for frugalscore (calls transformers' Trainer)
"bert_score>=0.3.6",
"rouge_score>=0.1.2",
"sacrebleu",
2 changes: 1 addition & 1 deletion src/evaluate/evaluation_suite/__init__.py
@@ -18,7 +18,7 @@
@dataclass
class SubTask:
task_type: str
data: [Union[str, Dataset]] = None
data: Optional[Union[str, Dataset]] = None
subset: Optional[str] = None
split: Optional[str] = None
data_preprocessor: Optional[Callable] = None
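For context on the `SubTask` fix above: `[Union[str, Dataset]]` is a list expression containing a type, not a valid annotation, so type checkers flag it, and the `= None` default calls for `Optional`. A minimal sketch of before and after (the bare `Dataset` stand-in is illustrative, used only to keep the snippet self-contained):

```python
from dataclasses import dataclass
from typing import Optional, Union


class Dataset:
    """Stand-in for datasets.Dataset, so the sketch runs on its own."""


@dataclass
class SubTask:
    task_type: str
    # Before: `data: [Union[str, Dataset]] = None` -- a one-element list
    # literal, which is not a type; mypy reports it as an invalid annotation.
    # After: a real Optional type, matching the `= None` default.
    data: Optional[Union[str, Dataset]] = None
```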
6 changes: 5 additions & 1 deletion src/evaluate/evaluator/audio_classification.py
@@ -13,7 +13,7 @@
# limitations under the License.

from numbers import Number
from typing import Any, Callable, Dict, Optional, Tuple, Union
from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Tuple, Union

from datasets import Dataset
from typing_extensions import Literal
@@ -23,6 +23,10 @@
from .base import EVALUATOR_COMPUTE_RETURN_DOCSTRING, EVALUTOR_COMPUTE_START_DOCSTRING, Evaluator


if TYPE_CHECKING:
from transformers import FeatureExtractionMixin, Pipeline, PreTrainedModel, PreTrainedTokenizer, TFPreTrainedModel


TASK_DOCUMENTATION = r"""
Examples:
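The `if TYPE_CHECKING:` block that this and the following evaluator modules gain is the standard idiom for typing-only imports: `TYPE_CHECKING` is `False` at runtime and `True` only under static type checkers, so the heavy, optional `transformers` import never executes when the module loads. A minimal sketch of the idiom (the `run` function is illustrative, not part of the library):

```python
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Evaluated only by type checkers (mypy, pyright); skipped at runtime,
    # so the module imports cleanly even without transformers installed.
    from transformers import Pipeline


def run(pipe: "Pipeline") -> None:  # quoted, so the name resolves lazily
    pipe("hello world")
```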
6 changes: 5 additions & 1 deletion src/evaluate/evaluator/automatic_speech_recognition.py
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Any, Callable, Dict, Optional, Tuple, Union
from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Tuple, Union

from datasets import Dataset
from typing_extensions import Literal
@@ -22,6 +22,10 @@
from .base import EVALUATOR_COMPUTE_RETURN_DOCSTRING, EVALUTOR_COMPUTE_START_DOCSTRING, Evaluator


if TYPE_CHECKING:
from transformers import Pipeline, PreTrainedModel, PreTrainedTokenizer, TFPreTrainedModel


TASK_DOCUMENTATION = r"""
Examples:
```python
6 changes: 5 additions & 1 deletion src/evaluate/evaluator/image_classification.py
@@ -13,7 +13,7 @@
# limitations under the License.

from numbers import Number
from typing import Any, Callable, Dict, Optional, Tuple, Union
from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Tuple, Union

from datasets import Dataset
from typing_extensions import Literal
@@ -23,6 +23,10 @@
from .base import EVALUATOR_COMPUTE_RETURN_DOCSTRING, EVALUTOR_COMPUTE_START_DOCSTRING, Evaluator


if TYPE_CHECKING:
from transformers import FeatureExtractionMixin, Pipeline, PreTrainedModel, PreTrainedTokenizer, TFPreTrainedModel


TASK_DOCUMENTATION = r"""
Examples:
```python
6 changes: 5 additions & 1 deletion src/evaluate/evaluator/question_answering.py
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Any, Callable, Dict, List, Optional, Tuple, Union
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union

# Lint as: python3
from datasets import Dataset
@@ -32,6 +32,10 @@
from .utils import DatasetColumn


if TYPE_CHECKING:
from transformers import Pipeline, PreTrainedModel, PreTrainedTokenizer, TFPreTrainedModel


logger = get_logger(__name__)


120 changes: 68 additions & 52 deletions src/evaluate/evaluator/text2text_generation.py
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Any, Callable, Dict, Optional, Tuple, Union
from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Tuple, Union

from datasets import Dataset
from typing_extensions import Literal
@@ -22,6 +22,10 @@
from .base import EVALUATOR_COMPUTE_RETURN_DOCSTRING, EVALUTOR_COMPUTE_START_DOCSTRING, Evaluator


if TYPE_CHECKING:
from transformers import Pipeline, PreTrainedModel, PreTrainedTokenizer, TFPreTrainedModel


TASK_DOCUMENTATION_KWARGS = r"""
input_column (`str`, defaults to `"text"`):
the name of the column containing the input text in the dataset specified by `data`.
@@ -31,6 +35,55 @@
The generation kwargs are passed to the pipeline and set the text generation strategy.
"""

TEXT2TEXT_TASK_DOCSTRING_EXAMPLE = r"""
Examples:
```python
>>> from evaluate import evaluator
>>> from datasets import load_dataset
>>> task_evaluator = evaluator("text2text-generation")
>>> data = load_dataset("cnn_dailymail", "3.0.0", split="validation[:40]")
>>> results = task_evaluator.compute(
>>> model_or_pipeline="facebook/bart-large-cnn",
>>> data=data,
>>> input_column="article",
>>> label_column="highlights",
>>> metric="rouge",
>>> )
```
"""

SUMMARIZATION_TASK_DOCSTRING_EXAMPLE = r"""
Examples:
```python
>>> from evaluate import evaluator
>>> from datasets import load_dataset
>>> task_evaluator = evaluator("summarization")
>>> data = load_dataset("cnn_dailymail", "3.0.0", split="validation[:40]")
>>> results = task_evaluator.compute(
>>> model_or_pipeline="facebook/bart-large-cnn",
>>> data=data,
>>> input_column="article",
>>> label_column="highlights",
>>> )
```
"""


TRANSLATION_TASK_DOCSTRING_EXAMPLE = r"""
Examples:
```python
>>> from evaluate import evaluator
>>> from datasets import load_dataset
>>> task_evaluator = evaluator("translation")
>>> data = load_dataset("wmt19", "fr-de", split="validation[:40]")
>>> data = data.map(lambda x: {"text": x["translation"]["de"], "label": x["translation"]["fr"]})
>>> results = task_evaluator.compute(
>>> model_or_pipeline="Helsinki-NLP/opus-mt-de-fr",
>>> data=data,
>>> )
```
"""


class Text2TextGenerationEvaluator(Evaluator):
"""
Expand All @@ -50,7 +103,10 @@ def predictions_processor(self, predictions, label_mapping):
return {"predictions": [pred[f"{self.PREDICTION_PREFIX}_text"] for pred in predictions]}

@add_start_docstrings(
EVALUTOR_COMPUTE_START_DOCSTRING, TASK_DOCUMENTATION_KWARGS, EVALUATOR_COMPUTE_RETURN_DOCSTRING
EVALUTOR_COMPUTE_START_DOCSTRING,
TASK_DOCUMENTATION_KWARGS,
EVALUATOR_COMPUTE_RETURN_DOCSTRING,
TEXT2TEXT_TASK_DOCSTRING_EXAMPLE,
)
def compute(
self,
@@ -71,23 +127,6 @@ def compute(
label_column: str = "label",
generation_kwargs: dict = None,
) -> Tuple[Dict[str, float], Any]:
"""
Examples:
```python
>>> from evaluate import evaluator
>>> from datasets import load_dataset
>>> task_evaluator = evaluator("text2text-generation")
>>> data = load_dataset("cnn_dailymail", "3.0.0", split="validation[:40]")
>>> results = task_evaluator.compute(
>>> model_or_pipeline="facebook/bart-large-cnn",
>>> data=data,
>>> input_column="article",
>>> label_column="highlights",
>>> metric="rouge",
>>> )
```
"""

if generation_kwargs is not None:
self.PIPELINE_KWARGS.update(generation_kwargs)

@@ -125,7 +164,10 @@ def __init__(self, task="summarization", default_metric_name=None):
super().__init__(task, default_metric_name=default_metric_name)

@add_start_docstrings(
EVALUTOR_COMPUTE_START_DOCSTRING, TASK_DOCUMENTATION_KWARGS, EVALUATOR_COMPUTE_RETURN_DOCSTRING
EVALUTOR_COMPUTE_START_DOCSTRING,
TASK_DOCUMENTATION_KWARGS,
EVALUATOR_COMPUTE_RETURN_DOCSTRING,
SUMMARIZATION_TASK_DOCSTRING_EXAMPLE,
)
def compute(
self,
@@ -146,22 +188,6 @@ def compute(
label_column: str = "label",
generation_kwargs: dict = None,
) -> Tuple[Dict[str, float], Any]:
"""
Examples:
```python
>>> from evaluate import evaluator
>>> from datasets import load_dataset
>>> task_evaluator = evaluator("summarization")
>>> data = load_dataset("cnn_dailymail", "3.0.0", split="validation[:40]")
>>> results = task_evaluator.compute(
>>> model_or_pipeline="facebook/bart-large-cnn",
>>> data=data,
>>> input_column="article",
>>> label_column="highlights",
>>> )
```
"""

result = super().compute(
model_or_pipeline=model_or_pipeline,
data=data,
Expand All @@ -176,6 +202,7 @@ def compute(
random_state=random_state,
input_column=input_column,
label_column=label_column,
generation_kwargs=generation_kwargs,
)

return result
@@ -196,7 +223,10 @@ def __init__(self, task="translation", default_metric_name=None):
super().__init__(task, default_metric_name=default_metric_name)

@add_start_docstrings(
EVALUTOR_COMPUTE_START_DOCSTRING, TASK_DOCUMENTATION_KWARGS, EVALUATOR_COMPUTE_RETURN_DOCSTRING
EVALUTOR_COMPUTE_START_DOCSTRING,
TASK_DOCUMENTATION_KWARGS,
EVALUATOR_COMPUTE_RETURN_DOCSTRING,
TRANSLATION_TASK_DOCSTRING_EXAMPLE,
)
def compute(
self,
@@ -217,21 +247,6 @@ def compute(
label_column: str = "label",
generation_kwargs: dict = None,
) -> Tuple[Dict[str, float], Any]:
"""
Examples:
```python
>>> from evaluate import evaluator
>>> from datasets import load_dataset
>>> task_evaluator = evaluator("translation")
>>> data = load_dataset("wmt19", "fr-de", split="validation[:40]")
>>> data = data.map(lambda x: {"text": x["translation"]["de"], "label": x["translation"]["fr"]})
>>> results = task_evaluator.compute(
>>> model_or_pipeline="Helsinki-NLP/opus-mt-de-fr",
>>> data=data,
>>> )
```
"""

result = super().compute(
model_or_pipeline=model_or_pipeline,
data=data,
Expand All @@ -246,6 +261,7 @@ def compute(
random_state=random_state,
input_column=input_column,
label_column=label_column,
generation_kwargs=generation_kwargs,
)

return result
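Two things happen in this file. First, the examples move out of the `compute` bodies into module-level constants passed to `add_start_docstrings`, placing them inside the composed `__doc__` where the documentation builder picks them up (the commit title suggests the old placement, in the method's own docstring after the generated sections, was not being rendered). The helper itself is a simple docstring concatenator; roughly (a sketch of the pattern, not the library's exact source):

```python
def add_start_docstrings(*docstr):
    """Prepend the given fragments to the decorated function's docstring."""
    def docstring_decorator(fn):
        fn.__doc__ = "".join(docstr) + (fn.__doc__ or "")
        return fn
    return docstring_decorator


# Illustrative fragments; the real ones are the module constants above.
START = "Compute the metric for a given pipeline and dataset.\n\n"
EXAMPLE = "Examples:\n    >>> ...\n"


@add_start_docstrings(START, EXAMPLE)
def compute():
    pass


print(compute.__doc__)  # START then EXAMPLE, in decorator-argument order
```

Second, the `generation_kwargs=generation_kwargs` lines added to both `super().compute(...)` calls fix a silent bug: `SummarizationEvaluator.compute` and `TranslationEvaluator.compute` accepted the argument but dropped it instead of forwarding it to the parent implementation (likely the "Minor fix" noted in the commit message).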
6 changes: 5 additions & 1 deletion src/evaluate/evaluator/text_classification.py
@@ -13,7 +13,7 @@
# limitations under the License.

from numbers import Number
from typing import Any, Callable, Dict, Optional, Tuple, Union
from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Tuple, Union

from datasets import Dataset, load_dataset
from typing_extensions import Literal
@@ -24,6 +24,10 @@
from .utils import DatasetColumnPair


if TYPE_CHECKING:
from transformers import FeatureExtractionMixin, Pipeline, PreTrainedModel, PreTrainedTokenizer, TFPreTrainedModel


TASK_DOCUMENTATION = r"""
Examples:
```python
6 changes: 5 additions & 1 deletion src/evaluate/evaluator/token_classification.py
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Any, Callable, Dict, List, Optional, Tuple, Union
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union

from datasets import ClassLabel, Dataset, Sequence
from typing_extensions import Literal
@@ -23,6 +23,10 @@
from .utils import DatasetColumn


if TYPE_CHECKING:
from transformers import Pipeline, PreTrainedModel, PreTrainedTokenizer, TFPreTrainedModel


TASK_DOCUMENTATION = r"""
The dataset input and label columns are expected to be formatted as a list of words and a list of labels respectively, following [conll2003 dataset](https://huggingface.co/datasets/conll2003). Datasets whose inputs are single strings, and labels are a list of offset are not supported.