Skip to content

Commit

Permalink
adding a punct standardization step
Browse files Browse the repository at this point in the history
  • Loading branch information
chanind committed May 9, 2022
1 parent 9250636 commit d08877e
Show file tree
Hide file tree
Showing 9 changed files with 81 additions and 38 deletions.
9 changes: 0 additions & 9 deletions frame_semantic_transformer/data/chunk_list.py

This file was deleted.

24 changes: 24 additions & 0 deletions frame_semantic_transformer/data/data_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
from __future__ import annotations
import re
from typing import Iterator, Sequence, TypeVar

T = TypeVar("T")


def chunk_list(lst: Sequence[T], chunk_size: int) -> Iterator[Sequence[T]]:
    """
    Yield consecutive slices of ``lst`` holding at most ``chunk_size`` items.

    Every chunk has exactly ``chunk_size`` items except possibly the last,
    which holds the remainder. An empty ``lst`` yields nothing.

    Args:
        lst: the sequence to split; it is sliced, never mutated.
        chunk_size: maximum number of items per chunk, must be >= 1.

    Raises:
        ValueError: if ``chunk_size`` is smaller than 1. Because this is a
            generator, the error surfaces when iteration starts.
    """
    # A negative step would make range() empty and silently drop every item,
    # and a zero step fails with an unrelated-looking range() error.
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")
    for start in range(0, len(lst), chunk_size):
        yield lst[start : start + chunk_size]


def standardize_punct(sent: str) -> str:
    """
    Try to standardize things like "He 's a man" -> "He's a man"

    Three rewrites are applied, in this order:

    1. runs of ``*`` are collapsed into a single ``*``;
    2. whitespace between an ASCII letter/digit and one of ``. ' , :``
       (optionally preceded by one ``*``) is removed;
    3. whitespace between an ASCII letter/digit and ``n't`` (optionally
       preceded by one ``*``) is removed.

    Args:
        sent: the sentence to normalize.

    Returns:
        The normalized sentence; input that matches none of the patterns is
        returned unchanged.
    """
    # Collapse repeated *'s first: the patterns below accept at most one
    # leading "*", so doing this last would leave e.g. "there **'s" as
    # "there *'s" and make the function non-idempotent.
    updated_sent = re.sub(r"\*+", "*", sent)
    # remove space before punct
    updated_sent = re.sub(r"([a-zA-Z0-9])\s+(\*?[.',:])", r"\1\2", updated_sent)
    # fix spaces in contractions
    updated_sent = re.sub(r"([a-zA-Z0-9])\s+(\*?n't)", r"\1\2", updated_sent)

    return updated_sent
15 changes: 0 additions & 15 deletions frame_semantic_transformer/data/shuffle_and_split.py

This file was deleted.

Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from __future__ import annotations
from dataclasses import dataclass
from frame_semantic_transformer.data.data_utils import standardize_punct

from frame_semantic_transformer.data.task_samples.TaskSample import TaskSample

Expand Down Expand Up @@ -40,4 +41,4 @@ def trigger_labeled_text(self) -> str:
pre_span = self.text[0 : self.trigger_loc[0]]
post_span = self.text[self.trigger_loc[1] :]
# TODO: handle these special chars better
return f"{pre_span}* {self.trigger} *{post_span}"
return standardize_punct(f"{pre_span}*{self.trigger}{post_span}")
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from __future__ import annotations
from dataclasses import dataclass
from frame_semantic_transformer.data.data_utils import standardize_punct

from frame_semantic_transformer.data.task_samples.TaskSample import TaskSample

Expand All @@ -25,14 +26,14 @@ def get_target(self) -> str:
output += self.text[prev_trigger_loc:loc] + "*"
prev_trigger_loc = loc
output += self.text[prev_trigger_loc:]
return output
return standardize_punct(output)

def evaluate_prediction(self, prediction: str) -> tuple[int, int, int]:
true_pos = 0
false_pos = 0
false_neg = 0

prediction_parts = prediction.split()
prediction_parts = standardize_punct(prediction).split()
target_parts = self.get_target().split()

for i, target_part in enumerate(target_parts):
Expand Down
2 changes: 1 addition & 1 deletion frame_semantic_transformer/evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from tqdm import tqdm
from transformers import T5ForConditionalGeneration, T5Tokenizer

from frame_semantic_transformer.data.chunk_list import chunk_list
from frame_semantic_transformer.data.data_utils import chunk_list
from frame_semantic_transformer.data.task_samples.TaskSample import TaskSample
from frame_semantic_transformer.predict import batch_predict

Expand Down
4 changes: 1 addition & 3 deletions tests/data/task_samples/test_FrameClassificationSample.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,7 @@


def test_get_input() -> None:
expected = (
"FRAME: Your * contribution * to Goodwill will mean more than you may know ."
)
expected = "FRAME: Your *contribution to Goodwill will mean more than you may know."
assert sample.get_input() == expected


Expand Down
14 changes: 7 additions & 7 deletions tests/data/task_samples/test_TriggerIdentificationSample.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,38 +6,38 @@


sample = TriggerIdentificationSample(
text="Your contribution to Goodwill will mean more than you may know .",
text="Your contribution to Goodwill will mean more than you may know.",
trigger_locs=[5, 18, 35, 40, 58, 54],
)


def test_get_input() -> None:
expected = (
"TRIGGER: Your contribution to Goodwill will mean more than you may know ."
"TRIGGER: Your contribution to Goodwill will mean more than you may know."
)
assert sample.get_input() == expected


def test_get_target() -> None:
expected = "Your *contribution *to Goodwill will *mean *more than you *may *know ."
expected = "Your *contribution *to Goodwill will *mean *more than you *may *know."
assert sample.get_target() == expected


def test_evaluate_prediction() -> None:
pred = "Your contribution *to Goodwill *will *mean *more than you may *know ."
pred = "Your contribution *to Goodwill *will *mean *more than you may *know."
assert sample.evaluate_prediction(pred) == (4, 1, 2)


def test_evaluate_prediction_fails_for_elements_whose_content_doesnt_match() -> None:
pred = "Your AHAHAHAHA *to BADWILL will *PSYCH *more than you may *know ."
pred = "Your AHAHAHAHA *to BADWILL will *PSYCH *more than you may *know."
assert sample.evaluate_prediction(pred) == (3, 1, 3)


def test_evaluate_prediction_treats_missing_words_as_wrong() -> None:
pred = "Your *contribution *to Goodwill will *mean"
assert sample.evaluate_prediction(pred) == (3, 3, 3)
assert sample.evaluate_prediction(pred) == (3, 2, 3)


def test_evaluate_prediction_treats_excess_words_as_false_positives() -> None:
pred = "Your *contribution *to Goodwill will *mean *more than you *may *know . ha ha ha ha!"
pred = "Your *contribution *to Goodwill will *mean *more than you *may *know. ha ha ha ha!"
assert sample.evaluate_prediction(pred) == (6, 4, 0)
43 changes: 43 additions & 0 deletions tests/data/test_data_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
from __future__ import annotations

from frame_semantic_transformer.data.data_utils import standardize_punct


def test_standardize_punct_removes_spaces_before_punctuation() -> None:
    """Spaces in front of ':', ',' and '.' are dropped."""
    spaced = "Old customs are still followed : Fate and luck are taken very seriously , and astrologers and fortune-tellers do a steady business ."
    tightened = "Old customs are still followed: Fate and luck are taken very seriously, and astrologers and fortune-tellers do a steady business."
    result = standardize_punct(spaced)
    assert result == tightened


def test_standardize_punct_leaves_sentences_as_is_if_punct_is_correct() -> None:
    """A sentence whose punctuation is already tight comes back unchanged."""
    already_clean = "Old customs are still followed: Fate and luck are taken very seriously, and astrologers and fortune-tellers do a steady business."
    result = standardize_punct(already_clean)
    assert result == already_clean


def test_standardize_punct_leaves_spaces_before_double_apostrophes() -> None:
    """The space between a period and a following '' is preserved."""
    quoted = "I really *like my *job. '' -- Sherry"
    result = standardize_punct(quoted)
    assert result == quoted


def test_standardize_punct_keeps_asterix_before_apostrophes() -> None:
    """A '*' marker in front of an apostrophe survives the space removal."""
    marked = "*Shopping *never *ends - *there *'s *always *another inviting *spot"
    wanted = "*Shopping *never *ends - *there*'s *always *another inviting *spot"
    result = standardize_punct(marked)
    assert result == wanted


def test_standardize_punct_removes_repeated_asterixes() -> None:
    """Consecutive '*' markers are collapsed into one."""
    doubled = "*Shopping **never *ends"
    collapsed = "*Shopping *never *ends"
    result = standardize_punct(doubled)
    assert result == collapsed


def test_standardize_punct_undoes_spaces_in_contractions() -> None:
    """A detached "n't" is rejoined to the preceding word."""
    split_contraction = "She did n't say so"
    joined = "She didn't say so"
    result = standardize_punct(split_contraction)
    assert result == joined


def test_standardize_punct_allows_asterix_in_contractions() -> None:
    """A '*' marker in front of "n't" is kept when the contraction is rejoined."""
    marked_contraction = "She did *n't say so"
    joined = "She did*n't say so"
    result = standardize_punct(marked_contraction)
    assert result == joined

0 comments on commit d08877e

Please sign in to comment.