Render Text2TextGenerationEvaluators' docstring examples #463

Merged · 3 commits · May 23, 2023
1 change: 1 addition & 0 deletions setup.py
@@ -115,6 +115,7 @@
"tensorflow>=2.3,!=2.6.0,!=2.6.1, <=2.10",
"torch",
# metrics dependencies
"accelerate", # for frugalscore (calls transformers' Trainer)
"bert_score>=0.3.6",
"rouge_score>=0.1.2",
"sacrebleu",
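Context for the new pin: the `frugalscore` metric drives `transformers`' `Trainer` when computing scores, and recent `transformers` releases need `accelerate` installed for `Trainer` to start. A minimal sketch of the code path that motivates the dependency (metric name and `compute` arguments as documented for `evaluate`; the inputs are illustrative):

```python
import evaluate

# Computing frugalscore runs transformers' Trainer under the hood, which in
# recent transformers releases fails at construction time unless `accelerate`
# is installed -- hence the new entry in setup.py above.
frugalscore = evaluate.load("frugalscore")
results = frugalscore.compute(
    predictions=["hello there", "general kenobi"],
    references=["hello there", "general kenobi"],
)
print(results["scores"])  # one score per prediction/reference pair
```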
2 changes: 1 addition & 1 deletion src/evaluate/evaluation_suite/__init__.py
@@ -18,7 +18,7 @@
@dataclass
class SubTask:
task_type: str
data: [Union[str, Dataset]] = None
data: Optional[Union[str, Dataset]] = None
subset: Optional[str] = None
split: Optional[str] = None
data_preprocessor: Optional[Callable] = None
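For context on the `SubTask` fix: `[Union[str, Dataset]]` is a one-element list containing a type, not a valid annotation, while `Optional[...]` is what the `= None` default actually implies. A sketch of the corrected dataclass, with the fields reproduced from the hunk above:

```python
from dataclasses import dataclass
from typing import Callable, Optional, Union

from datasets import Dataset


@dataclass
class SubTask:
    task_type: str
    # Before: `data: [Union[str, Dataset]] = None` -- a list literal, not a type.
    # After: Optional[...] correctly encodes "str, Dataset, or None".
    data: Optional[Union[str, Dataset]] = None
    subset: Optional[str] = None
    split: Optional[str] = None
    data_preprocessor: Optional[Callable] = None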
6 changes: 5 additions & 1 deletion src/evaluate/evaluator/audio_classification.py
@@ -13,7 +13,7 @@
# limitations under the License.

from numbers import Number
from typing import Any, Callable, Dict, Optional, Tuple, Union
from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Tuple, Union

from datasets import Dataset
from typing_extensions import Literal
@@ -23,6 +23,10 @@
from .base import EVALUATOR_COMPUTE_RETURN_DOCSTRING, EVALUTOR_COMPUTE_START_DOCSTRING, Evaluator


if TYPE_CHECKING:
from transformers import FeatureExtractionMixin, Pipeline, PreTrainedModel, PreTrainedTokenizer, TFPreTrainedModel


TASK_DOCUMENTATION = r"""
Examples:

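The `if TYPE_CHECKING:` guard added here (and in the other evaluator modules below) keeps `transformers` out of the runtime import path while still giving static type checkers the names used in annotations. A minimal sketch of the pattern; the `compute` signature is illustrative, not the evaluator's full one:

```python
from typing import TYPE_CHECKING, Optional, Union

if TYPE_CHECKING:
    # Evaluated only by static type checkers (mypy, pyright); at runtime this
    # block is skipped, so evaluate does not hard-depend on transformers.
    from transformers import Pipeline, PreTrainedModel, TFPreTrainedModel


def compute(
    model_or_pipeline: Optional[
        Union[str, "Pipeline", "PreTrainedModel", "TFPreTrainedModel"]
    ] = None,
):
    # String (forward-reference) annotations keep this valid even when
    # transformers was never actually imported.
    ...
```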
6 changes: 5 additions & 1 deletion src/evaluate/evaluator/automatic_speech_recognition.py
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Any, Callable, Dict, Optional, Tuple, Union
from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Tuple, Union

from datasets import Dataset
from typing_extensions import Literal
@@ -22,6 +22,10 @@
from .base import EVALUATOR_COMPUTE_RETURN_DOCSTRING, EVALUTOR_COMPUTE_START_DOCSTRING, Evaluator


if TYPE_CHECKING:
from transformers import Pipeline, PreTrainedModel, PreTrainedTokenizer, TFPreTrainedModel


TASK_DOCUMENTATION = r"""
Examples:
```python
6 changes: 5 additions & 1 deletion src/evaluate/evaluator/image_classification.py
@@ -13,7 +13,7 @@
# limitations under the License.

from numbers import Number
from typing import Any, Callable, Dict, Optional, Tuple, Union
from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Tuple, Union

from datasets import Dataset
from typing_extensions import Literal
@@ -23,6 +23,10 @@
from .base import EVALUATOR_COMPUTE_RETURN_DOCSTRING, EVALUTOR_COMPUTE_START_DOCSTRING, Evaluator


if TYPE_CHECKING:
from transformers import FeatureExtractionMixin, Pipeline, PreTrainedModel, PreTrainedTokenizer, TFPreTrainedModel


TASK_DOCUMENTATION = r"""
Examples:
```python
6 changes: 5 additions & 1 deletion src/evaluate/evaluator/question_answering.py
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Any, Callable, Dict, List, Optional, Tuple, Union
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union

# Lint as: python3
from datasets import Dataset
@@ -32,6 +32,10 @@
from .utils import DatasetColumn


if TYPE_CHECKING:
from transformers import Pipeline, PreTrainedModel, PreTrainedTokenizer, TFPreTrainedModel


logger = get_logger(__name__)


120 changes: 68 additions & 52 deletions src/evaluate/evaluator/text2text_generation.py
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Any, Callable, Dict, Optional, Tuple, Union
from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Tuple, Union

from datasets import Dataset
from typing_extensions import Literal
@@ -22,6 +22,10 @@
from .base import EVALUATOR_COMPUTE_RETURN_DOCSTRING, EVALUTOR_COMPUTE_START_DOCSTRING, Evaluator


if TYPE_CHECKING:
from transformers import Pipeline, PreTrainedModel, PreTrainedTokenizer, TFPreTrainedModel


TASK_DOCUMENTATION_KWARGS = r"""
input_column (`str`, defaults to `"text"`):
the name of the column containing the input text in the dataset specified by `data`.
@@ -31,6 +35,55 @@
The generation kwargs are passed to the pipeline and set the text generation strategy.
"""

TEXT2TEXT_TASK_DOCSTRING_EXAMPLE = r"""
Examples:
```python
>>> from evaluate import evaluator
>>> from datasets import load_dataset
>>> task_evaluator = evaluator("text2text-generation")
>>> data = load_dataset("cnn_dailymail", "3.0.0", split="validation[:40]")
>>> results = task_evaluator.compute(
>>> model_or_pipeline="facebook/bart-large-cnn",
>>> data=data,
>>> input_column="article",
>>> label_column="highlights",
>>> metric="rouge",
>>> )
```
"""

SUMMARIZATION_TASK_DOCSTRING_EXAMPLE = r"""
Examples:
```python
>>> from evaluate import evaluator
>>> from datasets import load_dataset
>>> task_evaluator = evaluator("summarization")
>>> data = load_dataset("cnn_dailymail", "3.0.0", split="validation[:40]")
>>> results = task_evaluator.compute(
>>> model_or_pipeline="facebook/bart-large-cnn",
>>> data=data,
>>> input_column="article",
>>> label_column="highlights",
>>> )
```
"""


TRANSLATION_TASK_DOCSTRING_EXAMPLE = r"""
Examples:
```python
>>> from evaluate import evaluator
>>> from datasets import load_dataset
>>> task_evaluator = evaluator("translation")
>>> data = load_dataset("wmt19", "fr-de", split="validation[:40]")
>>> data = data.map(lambda x: {"text": x["translation"]["de"], "label": x["translation"]["fr"]})
>>> results = task_evaluator.compute(
>>> model_or_pipeline="Helsinki-NLP/opus-mt-de-fr",
>>> data=data,
>>> )
```
"""


class Text2TextGenerationEvaluator(Evaluator):
"""
Expand All @@ -50,7 +103,10 @@ def predictions_processor(self, predictions, label_mapping):
return {"predictions": [pred[f"{self.PREDICTION_PREFIX}_text"] for pred in predictions]}

@add_start_docstrings(
EVALUTOR_COMPUTE_START_DOCSTRING, TASK_DOCUMENTATION_KWARGS, EVALUATOR_COMPUTE_RETURN_DOCSTRING
EVALUTOR_COMPUTE_START_DOCSTRING,
TASK_DOCUMENTATION_KWARGS,
EVALUATOR_COMPUTE_RETURN_DOCSTRING,
TEXT2TEXT_TASK_DOCSTRING_EXAMPLE,
)
def compute(
self,
@@ -71,23 +127,6 @@ def compute(
label_column: str = "label",
generation_kwargs: dict = None,
) -> Tuple[Dict[str, float], Any]:
"""
Examples:
```python
>>> from evaluate import evaluator
>>> from datasets import load_dataset
>>> task_evaluator = evaluator("text2text-generation")
>>> data = load_dataset("cnn_dailymail", "3.0.0", split="validation[:40]")
>>> results = task_evaluator.compute(
>>> model_or_pipeline="facebook/bart-large-cnn",
>>> data=data,
>>> input_column="article",
>>> label_column="highlights",
>>> metric="rouge",
>>> )
```
"""

if generation_kwargs is not None:
self.PIPELINE_KWARGS.update(generation_kwargs)

@@ -125,7 +164,10 @@ def __init__(self, task="summarization", default_metric_name=None):
super().__init__(task, default_metric_name=default_metric_name)

@add_start_docstrings(
EVALUTOR_COMPUTE_START_DOCSTRING, TASK_DOCUMENTATION_KWARGS, EVALUATOR_COMPUTE_RETURN_DOCSTRING
EVALUTOR_COMPUTE_START_DOCSTRING,
TASK_DOCUMENTATION_KWARGS,
EVALUATOR_COMPUTE_RETURN_DOCSTRING,
SUMMARIZATION_TASK_DOCSTRING_EXAMPLE,
)
def compute(
self,
@@ -146,22 +188,6 @@ def compute(
label_column: str = "label",
generation_kwargs: dict = None,
) -> Tuple[Dict[str, float], Any]:
"""
Examples:
```python
>>> from evaluate import evaluator
>>> from datasets import load_dataset
>>> task_evaluator = evaluator("summarization")
>>> data = load_dataset("cnn_dailymail", "3.0.0", split="validation[:40]")
>>> results = task_evaluator.compute(
>>> model_or_pipeline="facebook/bart-large-cnn",
>>> data=data,
>>> input_column="article",
>>> label_column="highlights",
>>> )
```
"""

result = super().compute(
model_or_pipeline=model_or_pipeline,
data=data,
Expand All @@ -176,6 +202,7 @@ def compute(
random_state=random_state,
input_column=input_column,
label_column=label_column,
generation_kwargs=generation_kwargs,
)

return result
@@ -196,7 +223,10 @@ def __init__(self, task="translation", default_metric_name=None):
super().__init__(task, default_metric_name=default_metric_name)

@add_start_docstrings(
EVALUTOR_COMPUTE_START_DOCSTRING, TASK_DOCUMENTATION_KWARGS, EVALUATOR_COMPUTE_RETURN_DOCSTRING
EVALUTOR_COMPUTE_START_DOCSTRING,
TASK_DOCUMENTATION_KWARGS,
EVALUATOR_COMPUTE_RETURN_DOCSTRING,
TRANSLATION_TASK_DOCSTRING_EXAMPLE,
)
def compute(
self,
@@ -217,21 +247,6 @@ def compute(
label_column: str = "label",
generation_kwargs: dict = None,
) -> Tuple[Dict[str, float], Any]:
"""
Examples:
```python
>>> from evaluate import evaluator
>>> from datasets import load_dataset
>>> task_evaluator = evaluator("translation")
>>> data = load_dataset("wmt19", "fr-de", split="validation[:40]")
>>> data = data.map(lambda x: {"text": x["translation"]["de"], "label": x["translation"]["fr"]})
>>> results = task_evaluator.compute(
>>> model_or_pipeline="Helsinki-NLP/opus-mt-de-fr",
>>> data=data,
>>> )
```
"""

result = super().compute(
model_or_pipeline=model_or_pipeline,
data=data,
Expand All @@ -246,6 +261,7 @@ def compute(
random_state=random_state,
input_column=input_column,
label_column=label_column,
generation_kwargs=generation_kwargs,
)

return result
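Beyond moving the examples into module-level constants that `add_start_docstrings` appends to each `compute` docstring (so the doc builder can render them), the hunks above fix a real bug: `SummarizationEvaluator.compute` and `TranslationEvaluator.compute` previously dropped `generation_kwargs` instead of forwarding it to `super().compute()`. A usage sketch under that fix; model, dataset slice, and columns come from the docstring example above, while the `max_length`/`num_beams` values are illustrative:

```python
from datasets import load_dataset
from evaluate import evaluator

task_evaluator = evaluator("summarization")
data = load_dataset("cnn_dailymail", "3.0.0", split="validation[:40]")

# With the fix, these kwargs now reach the underlying generation pipeline;
# before it, they were silently ignored by the summarization and
# translation evaluators.
results = task_evaluator.compute(
    model_or_pipeline="facebook/bart-large-cnn",
    data=data,
    input_column="article",
    label_column="highlights",
    generation_kwargs={"max_length": 128, "num_beams": 4},
)
```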
6 changes: 5 additions & 1 deletion src/evaluate/evaluator/text_classification.py
@@ -13,7 +13,7 @@
# limitations under the License.

from numbers import Number
from typing import Any, Callable, Dict, Optional, Tuple, Union
from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Tuple, Union

from datasets import Dataset, load_dataset
from typing_extensions import Literal
@@ -24,6 +24,10 @@
from .utils import DatasetColumnPair


if TYPE_CHECKING:
from transformers import FeatureExtractionMixin, Pipeline, PreTrainedModel, PreTrainedTokenizer, TFPreTrainedModel


TASK_DOCUMENTATION = r"""
Examples:
```python
6 changes: 5 additions & 1 deletion src/evaluate/evaluator/token_classification.py
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Any, Callable, Dict, List, Optional, Tuple, Union
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union

from datasets import ClassLabel, Dataset, Sequence
from typing_extensions import Literal
@@ -23,6 +23,10 @@
from .utils import DatasetColumn


if TYPE_CHECKING:
from transformers import Pipeline, PreTrainedModel, PreTrainedTokenizer, TFPreTrainedModel


TASK_DOCUMENTATION = r"""
The dataset input and label columns are expected to be formatted as a list of words and a list of labels respectively, following [conll2003 dataset](https://huggingface.co/datasets/conll2003). Datasets whose inputs are single strings, and labels are a list of offset are not supported.

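As the token-classification documentation above notes, inputs must be a list of words aligned with a list of labels, conll2003-style. A hedged sketch of one such row (tag names follow the usual IOB scheme; in conll2003 itself `ner_tags` are `ClassLabel` integer ids that decode to strings like these):

```python
# One dataset row in the expected format: tokens and labels are parallel lists,
# one label per word -- not a single string with character offsets.
example = {
    "tokens": ["HuggingFace", "is", "based", "in", "New", "York"],
    "ner_tags": ["B-ORG", "O", "O", "O", "B-LOC", "I-LOC"],
}
assert len(example["tokens"]) == len(example["ner_tags"])
```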