-
Notifications
You must be signed in to change notification settings - Fork 10
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
9 changed files
with
81 additions
and
38 deletions.
There are no files selected for viewing
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
from __future__ import annotations | ||
import re | ||
from typing import Iterator, Sequence, TypeVar | ||
|
||
T = TypeVar("T") | ||
|
||
|
||
def chunk_list(lst: Sequence[T], chunk_size: int) -> Iterator[Sequence[T]]:
    """
    Yield consecutive slices of `lst`, each holding at most `chunk_size` items.

    The final slice is shorter when len(lst) is not a multiple of chunk_size,
    and an empty `lst` yields nothing.

    Raises ValueError if chunk_size is smaller than 1. Because this is a
    generator, the error surfaces when iteration starts, not at call time.
    """
    # A zero step makes range() fail with an unrelated-looking message, and a
    # negative step silently produces no chunks at all, so reject both up front.
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")
    for start in range(0, len(lst), chunk_size):
        yield lst[start : start + chunk_size]
|
||
|
||
def standardize_punct(sent: str) -> str:
    """
    Try to standardize things like "He 's a man" -> "He's a man"

    Three rewrites are applied in order:
      1. runs of "*" are collapsed to a single "*"
      2. whitespace between a letter/digit and one of . ' , : is removed
         (an optional single "*" directly before the punctuation is kept)
      3. whitespace between a letter/digit and "n't" is removed
         (again keeping an optional single "*" before it)

    Returns the rewritten sentence; `sent` itself is not modified.
    """
    # Collapse repeated *'s first: the patterns below only tolerate a single
    # optional "*", so input like "there **'s" would otherwise keep its space.
    updated_sent = re.sub(r"\*+", "*", sent)
    # remove space before punct
    updated_sent = re.sub(r"([a-zA-Z0-9])\s+(\*?[.',:])", r"\1\2", updated_sent)
    # fix spaces in contractions
    updated_sent = re.sub(r"([a-zA-Z0-9])\s+(\*?n't)", r"\1\2", updated_sent)

    return updated_sent
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
from __future__ import annotations | ||
|
||
from frame_semantic_transformer.data.data_utils import standardize_punct | ||
|
||
|
||
def test_standardize_punct_removes_spaces_before_punctuation() -> None:
    """A space separating a word from a following ':', ',' or '.' is dropped."""
    spaced = "Old customs are still followed : Fate and luck are taken very seriously , and astrologers and fortune-tellers do a steady business ."
    tightened = "Old customs are still followed: Fate and luck are taken very seriously, and astrologers and fortune-tellers do a steady business."
    result = standardize_punct(spaced)
    assert result == tightened
|
||
|
||
def test_standardize_punct_leaves_sentences_as_is_if_punct_is_correct() -> None:
    """A sentence whose punctuation is already attached comes back unchanged."""
    already_clean = "Old customs are still followed: Fate and luck are taken very seriously, and astrologers and fortune-tellers do a steady business."
    result = standardize_punct(already_clean)
    assert result == already_clean
|
||
|
||
def test_standardize_punct_leaves_spaces_before_double_apostrophes() -> None:
    """The space between the period and the trailing '' must survive."""
    quoted = "I really *like my *job. '' -- Sherry"
    result = standardize_punct(quoted)
    assert result == quoted
|
||
|
||
def test_standardize_punct_keeps_asterix_before_apostrophes() -> None:
    """Joining "*'s" onto the previous word keeps its leading asterisk."""
    marked = "*Shopping *never *ends - *there *'s *always *another inviting *spot"
    joined = "*Shopping *never *ends - *there*'s *always *another inviting *spot"
    result = standardize_punct(marked)
    assert result == joined
|
||
|
||
def test_standardize_punct_removes_repeated_asterixes() -> None:
    """A doubled asterisk in front of a word is reduced to a single one."""
    doubled = "*Shopping **never *ends"
    single = "*Shopping *never *ends"
    result = standardize_punct(doubled)
    assert result == single
|
||
|
||
def test_standardize_punct_undoes_spaces_in_contractions() -> None:
    """A detached "n't" is glued back onto the word before it."""
    split_form = "She did n't say so"
    contracted = "She didn't say so"
    result = standardize_punct(split_form)
    assert result == contracted
|
||
|
||
def test_standardize_punct_allows_asterix_in_contractions() -> None:
    """A detached "*n't" is joined to the previous word, asterisk included."""
    split_form = "She did *n't say so"
    contracted = "She did*n't say so"
    result = standardize_punct(split_form)
    assert result == contracted