added handling for abbreviations #47

Merged · 9 commits · Nov 13, 2023
Changes from all commits
src/tokenizer/tokenizer.py · 15 changes: 4 additions & 11 deletions
@@ -1453,7 +1453,6 @@ def generate_raw_tokens(
big_text: str

for big_text in text_or_gen:
-
if not one_sent_per_line and not big_text:
# An explicit empty string in the input always
# causes a sentence split
@@ -1831,7 +1830,6 @@ def parse_mixed(
pp = PunctuationParser()

while rt.txt:
-
# Handle punctuation
yield from pp.parse(rt)
rt, ate = pp.rt, pp.ate
@@ -2350,7 +2348,6 @@ def lookup(abbrev: str) -> Optional[List[BIN_Tuple]]:
if (
token.kind == TOK.NUMBER or token.kind == TOK.YEAR
) and next_token.txt in SI_UNITS:
-
value = token.number
orig_unit = next_token.txt
unit: str
@@ -2504,7 +2501,6 @@ def parse_sentences(token_stream: Iterator[Tok]) -> Iterator[Tok]:
tok_end_sentence = TOK.End_Sentence()

try:
-
# Maintain a one-token lookahead
token = next(token_stream)
while True:
@@ -2641,7 +2637,6 @@ def parse_phrases_1(token_stream: Iterator[Tok]) -> Iterator[Tok]:

token = cast(Tok, None)
try:
-
# Maintain a one-token lookahead
token = next(token_stream)
while True:
@@ -2700,7 +2695,6 @@ def parse_phrases_1(token_stream: Iterator[Tok]) -> Iterator[Tok]:

# Check for [date] [year]
if token.kind == TOK.DATE and next_token.kind == TOK.YEAR:
-
dt = cast(DateTimeTuple, token.val)
if not dt[0]:
# No year yet: add it
@@ -2760,7 +2754,6 @@ def parse_date_and_time(token_stream: Iterator[Tok]) -> Iterator[Tok]:

token = cast(Tok, None)
try:
-
# Maintain a one-token lookahead
token = next(token_stream)

@@ -2920,12 +2913,10 @@ def parse_phrases_2(

token = cast(Tok, None)
try:
-
# Maintain a one-token lookahead
token = next(token_stream)

while True:
-
next_token = next(token_stream)

# Logic for numbers and fractions that are partially or entirely
@@ -2945,7 +2936,6 @@

# Check for [number] [ISK_AMOUNT|CURRENCY|PERCENTAGE]
elif token.kind == TOK.NUMBER and next_token.kind == TOK.WORD:
-
if next_token.txt in AMOUNT_ABBREV:
# Abbreviations for ISK amounts
# For abbreviations, we do not know the case,
@@ -3122,7 +3112,6 @@ def mark_paragraphs(txt: str) -> str:


def paragraphs(tokens: Iterable[Tok]) -> Iterator[List[Tuple[int, List[Tok]]]]:
-
"""Generator yielding paragraphs from token iterable. Each paragraph is a list
of sentence tuples. Sentence tuples consist of the index of the first token
of the sentence (the TOK.S_BEGIN token) and a list of the tokens within the
@@ -3178,6 +3167,10 @@ def valid_sent(sent: Optional[List[Tok]]) -> bool:
r"|([\+\-\$€]?\d+\,\d+(?!\.\d))" # -1234,56
# The following regex catches English numbers with a dot only
r"|([\+\-\$€]?\d+\.\d+(?!\,\d))" # -1234.56
+# The following regex catches Icelandic abbreviations, e.g. a.m.k., A.M.K., þ.e.a.s.
+r"|([^\W\d_]+\.(?:[^\W\d_]+\.)+)(?![^\W\d_]+\s)"
+# The following regex catches degree characters, i.e. °C, °F
+r"|(°[CF])"
# Finally, space and punctuation
r"|([~\s"
+ "".join("\\" + c for c in PUNCTUATION)
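For reference, a minimal standalone sketch of what the two added alternations match, assuming they are tested in isolation rather than inside the full tokenizer regex (the names ABBREV_RE and DEGREE_RE are hypothetical, not from the source):

    import re

    # Multi-part abbreviations such as a.m.k. or þ.e.a.s.: a letter run
    # ending in a dot, followed by at least one more dotted letter run,
    # with a lookahead that keeps the match from bleeding into a
    # following ordinary word.
    ABBREV_RE = re.compile(r"[^\W\d_]+\.(?:[^\W\d_]+\.)+(?![^\W\d_]+\s)")

    # Degree sign plus C or F, kept together as one token: °C, °F
    DEGREE_RE = re.compile(r"°[CF]")

    for s in ("a.m.k.", "A.M.K.", "þ.e.a.s.", "o.s.frv."):
        m = ABBREV_RE.match(s)
        assert m is not None and m.group(0) == s

    assert ABBREV_RE.match("17.4") is None  # digits are excluded from the class
    assert DEGREE_RE.match("°C") and DEGREE_RE.match("°F")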
test/test_tokenizer.py · 22 changes: 17 additions & 5 deletions
@@ -63,7 +63,6 @@ def get_text_and_norm(orig: str) -> Tuple[str, str]:


def test_single_tokens() -> None:
-
TEST_CASES = [
(".", TOK.PUNCTUATION),
(",", TOK.PUNCTUATION),
@@ -616,7 +615,6 @@ def run_test(test_cases: Iterable[TestCase], **options: Any) -> None:


def test_sentences() -> None:
-
KIND = {
"B": TOK.S_BEGIN,
"E": TOK.S_END,
@@ -646,7 +644,6 @@ def test_sentences() -> None:
}

def test_sentence(text: str, expected: str, **options: Any) -> None:
-
exp = expected.split()
s = list(t.tokenize(text, **options))
assert len(s) == len(exp)
@@ -1131,7 +1128,23 @@ def test_correct_spaces() -> None:
s = t.correct_spaces("Jón- sem var formaður — mótmælti málinu.")
assert s == "Jón-sem var formaður—mótmælti málinu."
s = t.correct_spaces("Það á að geyma mjólkina við 20 ± 3 °C")
-assert s == "Það á að geyma mjólkina við 20±3° C"
+assert s == "Það á að geyma mjólkina við 20±3 °C"
+s = t.correct_spaces("Við förum t.d. til Íslands o.s.frv.")
+assert s == "Við förum t.d. til Íslands o.s.frv."
+s = t.correct_spaces("Við förum t. d. til Íslands o. s. frv.")
+assert (
+    s == "Við förum t. d. til Íslands o. s. frv."
+)  # This shouldn't be corrected here
+s = t.correct_spaces("M.a. lögum við bil.")
+assert s == "M.a. lögum við bil."
+s = t.correct_spaces("HANN BORÐAR Þ.Á.M. EPLI.")
+assert s == "HANN BORÐAR Þ.Á.M. EPLI."
+s = t.correct_spaces("Ég fór til Írlands 6.júní og þar var 17.4°C hiti eða 230.3K.")
+assert s == "Ég fór til Írlands 6. júní og þar var 17.4 °C hiti eða 230.3 K."
+s = t.correct_spaces(
+    "Þetta er setning.Þetta er önnur setning.Líka.En hvað með þetta?"
+)
+assert s == "Þetta er setning. Þetta er önnur setning. Líka. En hvað með þetta?"


def test_abbrev() -> None:
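The corrected behavior exercised by the new test cases can be reproduced with the package's top-level correct_spaces function; a quick usage sketch on one of the inputs above:

    import tokenizer as t

    s = t.correct_spaces("Ég fór til Írlands 6.júní og þar var 17.4°C hiti eða 230.3K.")
    print(s)
    # Ég fór til Írlands 6. júní og þar var 17.4 °C hiti eða 230.3 K.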
@@ -2556,7 +2569,6 @@ def test_one_sent_per_line() -> None:


if __name__ == "__main__":
-
test_single_tokens()
test_sentences()
test_correct_spaces()