adding a punct standardization step

chanind · May 9, 2022 · d08877e
1 parent 9250636
commit d08877e
Show file tree

Hide file tree

Showing 9 changed files with 81 additions and 38 deletions.
diff --git a/frame_semantic_transformer/data/chunk_list.py b/frame_semantic_transformer/data/chunk_list.py
diff --git a/frame_semantic_transformer/data/data_utils.py b/frame_semantic_transformer/data/data_utils.py
@@ -0,0 +1,24 @@
+from __future__ import annotations
+import re
+from typing import Iterator, Sequence, TypeVar
+
+T = TypeVar("T")
+
+
+def chunk_list(lst: Sequence[T], chunk_size: int) -> Iterator[Sequence[T]]:
+    for i in range(0, len(lst), chunk_size):  # i is the start index of each chunk
+        yield lst[i : i + chunk_size]  # consecutive slice; the last one may hold fewer items
+
+
+def standardize_punct(sent: str) -> str:
+    """
+    Join . ' , : and n't onto the preceding letter/digit and collapse runs of *, e.g. "He 's a man" -> "He's a man"
+    """
+    # drop whitespace between a letter/digit and . ' , or : (an optional * directly before the punctuation is kept)
+    updated_sent = re.sub(r"([a-zA-Z0-9])\s+(\*?[.',:])", r"\1\2", sent)
+    # collapse each run of consecutive *'s into a single *
+    updated_sent = re.sub(r"\*+", "*", updated_sent)
+    # rejoin a detached n't, e.g. "did n't" -> "didn't" (an optional * directly before n't is kept)
+    updated_sent = re.sub(r"([a-zA-Z0-9])\s+(\*?n't)", r"\1\2", updated_sent)
+
+    return updated_sent
diff --git a/frame_semantic_transformer/data/shuffle_and_split.py b/frame_semantic_transformer/data/shuffle_and_split.py
diff --git a/frame_semantic_transformer/data/task_samples/FrameClassificationSample.py b/frame_semantic_transformer/data/task_samples/FrameClassificationSample.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 from dataclasses import dataclass
+from frame_semantic_transformer.data.data_utils import standardize_punct
 
 from frame_semantic_transformer.data.task_samples.TaskSample import TaskSample
 
@@ -40,4 +41,4 @@ def trigger_labeled_text(self) -> str:
         pre_span = self.text[0 : self.trigger_loc[0]]
         post_span = self.text[self.trigger_loc[1] :]
         # TODO: handle these special chars better
-        return f"{pre_span}* {self.trigger} *{post_span}"
+        return standardize_punct(f"{pre_span}*{self.trigger}{post_span}")
diff --git a/frame_semantic_transformer/data/task_samples/TriggerIdentificationSample.py b/frame_semantic_transformer/data/task_samples/TriggerIdentificationSample.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 from dataclasses import dataclass
+from frame_semantic_transformer.data.data_utils import standardize_punct
 
 from frame_semantic_transformer.data.task_samples.TaskSample import TaskSample
 
@@ -25,14 +26,14 @@ def get_target(self) -> str:
             output += self.text[prev_trigger_loc:loc] + "*"
             prev_trigger_loc = loc
         output += self.text[prev_trigger_loc:]
-        return output
+        return standardize_punct(output)
 
     def evaluate_prediction(self, prediction: str) -> tuple[int, int, int]:
         true_pos = 0
         false_pos = 0
         false_neg = 0
 
-        prediction_parts = prediction.split()
+        prediction_parts = standardize_punct(prediction).split()
         target_parts = self.get_target().split()
 
         for i, target_part in enumerate(target_parts):

diff --git a/frame_semantic_transformer/evaluate.py b/frame_semantic_transformer/evaluate.py
@@ -4,7 +4,7 @@
 from tqdm import tqdm
 from transformers import T5ForConditionalGeneration, T5Tokenizer
 
-from frame_semantic_transformer.data.chunk_list import chunk_list
+from frame_semantic_transformer.data.data_utils import chunk_list
 from frame_semantic_transformer.data.task_samples.TaskSample import TaskSample
 from frame_semantic_transformer.predict import batch_predict
 

diff --git a/tests/data/task_samples/test_FrameClassificationSample.py b/tests/data/task_samples/test_FrameClassificationSample.py
@@ -13,9 +13,7 @@
 
 
 def test_get_input() -> None:
-    expected = (
-        "FRAME: Your * contribution * to Goodwill will mean more than you may know ."
-    )
+    expected = "FRAME: Your *contribution to Goodwill will mean more than you may know."
     assert sample.get_input() == expected
 
 

diff --git a/tests/data/task_samples/test_TriggerIdentificationSample.py b/tests/data/task_samples/test_TriggerIdentificationSample.py
@@ -6,38 +6,38 @@
 
 
 sample = TriggerIdentificationSample(
-    text="Your contribution to Goodwill will mean more than you may know .",
+    text="Your contribution to Goodwill will mean more than you may know.",
     trigger_locs=[5, 18, 35, 40, 58, 54],
 )
 
 
 def test_get_input() -> None:
     expected = (
-        "TRIGGER: Your contribution to Goodwill will mean more than you may know ."
+        "TRIGGER: Your contribution to Goodwill will mean more than you may know."
     )
     assert sample.get_input() == expected
 
 
 def test_get_target() -> None:
-    expected = "Your *contribution *to Goodwill will *mean *more than you *may *know ."
+    expected = "Your *contribution *to Goodwill will *mean *more than you *may *know."
     assert sample.get_target() == expected
 
 
 def test_evaluate_prediction() -> None:
-    pred = "Your contribution *to Goodwill *will *mean *more than you may *know ."
+    pred = "Your contribution *to Goodwill *will *mean *more than you may *know."
     assert sample.evaluate_prediction(pred) == (4, 1, 2)
 
 
 def test_evaluate_prediction_fails_for_elements_whose_content_doesnt_match() -> None:
-    pred = "Your AHAHAHAHA *to BADWILL will *PSYCH *more than you may *know ."
+    pred = "Your AHAHAHAHA *to BADWILL will *PSYCH *more than you may *know."
     assert sample.evaluate_prediction(pred) == (3, 1, 3)
 
 
 def test_evaluate_prediction_treats_missing_words_as_wrong() -> None:
     pred = "Your *contribution *to Goodwill will *mean"
-    assert sample.evaluate_prediction(pred) == (3, 3, 3)
+    assert sample.evaluate_prediction(pred) == (3, 2, 3)
 
 
 def test_evaluate_prediction_treats_excess_words_as_false_positives() -> None:
-    pred = "Your *contribution *to Goodwill will *mean *more than you *may *know . ha ha ha ha!"
+    pred = "Your *contribution *to Goodwill will *mean *more than you *may *know. ha ha ha ha!"
     assert sample.evaluate_prediction(pred) == (6, 4, 0)
diff --git a/tests/data/test_data_utils.py b/tests/data/test_data_utils.py
@@ -0,0 +1,43 @@
+from __future__ import annotations
+
+from frame_semantic_transformer.data.data_utils import standardize_punct
+
+
+def test_standardize_punct_removes_spaces_before_punctuation() -> None:
+    original = "Old customs are still followed : Fate and luck are taken very seriously , and astrologers and fortune-tellers do a steady business ."
+    expected = "Old customs are still followed: Fate and luck are taken very seriously, and astrologers and fortune-tellers do a steady business."
+    assert standardize_punct(original) == expected  # the spaces before ":", "," and "." are dropped
+
+
+def test_standardize_punct_leaves_sentences_as_is_if_punct_is_correct() -> None:
+    sent = "Old customs are still followed: Fate and luck are taken very seriously, and astrologers and fortune-tellers do a steady business."
+    assert standardize_punct(sent) == sent  # text with no space before punctuation must come back unchanged
+
+
+def test_standardize_punct_leaves_spaces_before_double_apostrophes() -> None:
+    sent = "I really *like my *job. '' -- Sherry"
+    assert standardize_punct(sent) == sent  # the space between "." and "''" must survive
+
+
+def test_standardize_punct_keeps_asterix_before_apostrophes() -> None:
+    original = "*Shopping *never *ends - *there *'s *always *another inviting *spot"
+    expected = "*Shopping *never *ends - *there*'s *always *another inviting *spot"
+    assert standardize_punct(original) == expected  # "there *'s" joins to "there*'s"; the * before the apostrophe stays
+
+
+def test_standardize_punct_removes_repeated_asterixes() -> None:
+    original = "*Shopping **never *ends"
+    expected = "*Shopping *never *ends"
+    assert standardize_punct(original) == expected  # "**never" collapses to "*never"
+
+
+def test_standardize_punct_undoes_spaces_in_contractions() -> None:
+    original = "She did n't say so"
+    expected = "She didn't say so"
+    assert standardize_punct(original) == expected  # "did n't" is rejoined as "didn't"
+
+
+def test_standardize_punct_allows_asterix_in_contractions() -> None:
+    original = "She did *n't say so"
+    expected = "She did*n't say so"
+    assert standardize_punct(original) == expected  # "did *n't" joins to "did*n't", keeping the *