Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

added handling for abbreviations #47

Merged
merged 9 commits on
Nov 13, 2023
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 17 additions & 17 deletions src/tokenizer/tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -316,8 +316,8 @@ def concatenate(
[len(self_original)] * len(separator) if len(other_origin_spans) > 0 else []
)
new_origin_spans = (
self_origin_spans
+ separator_origin_spans
self_origin_spans
+ separator_origin_spans
+ [i + len(self_original) for i in other_origin_spans]
)

Expand Down Expand Up @@ -1453,7 +1453,7 @@ def generate_raw_tokens(
big_text: str

for big_text in text_or_gen:

if not one_sent_per_line and not big_text:
# An explicit empty string in the input always
# causes a sentence split
Expand Down Expand Up @@ -1821,7 +1821,6 @@ def parse(self, rt: Tok) -> Iterable[Tok]:
self.rt = rt
self.ate = ate


def parse_mixed(
rt: Tok, handle_kludgy_ordinals: int, convert_numbers: bool
) -> Iterable[Tok]:
Expand All @@ -1831,7 +1830,6 @@ def parse_mixed(
pp = PunctuationParser()

while rt.txt:

# Handle punctuation
yield from pp.parse(rt)
rt, ate = pp.rt, pp.ate
Expand Down Expand Up @@ -2143,7 +2141,7 @@ def lookup(abbrev: str) -> Optional[List[BIN_Tuple]]:
# i.e. with a trailing period: It can end a sentence
if token.kind == TOK.DATEREL and "." in token.txt:
if (
next_token.txt == "."
next_token.txt == "."
and not token_stream.could_be_end_of_sentence()
):
# This is something like 'Ég fæddist 25.9. í Svarfaðardal.'
Expand All @@ -2155,8 +2153,8 @@ def lookup(abbrev: str) -> Optional[List[BIN_Tuple]]:
# abbreviation token
if next_token.punctuation == ".":
if (
token.kind == TOK.WORD
and token.txt[-1] != "."
token.kind == TOK.WORD
and token.txt[-1] != "."
and is_abbr_with_period(token.txt)
):
# Abbreviation ending with period: make a special token for it
Expand Down Expand Up @@ -2196,7 +2194,7 @@ def lookup(abbrev: str) -> Optional[List[BIN_Tuple]]:
# Set token to the period
token = next_token
elif (
abbrev in Abbreviations.NOT_FINISHERS
abbrev in Abbreviations.NOT_FINISHERS
or abbrev.lower() in Abbreviations.NOT_FINISHERS
):
# This is a potential abbreviation that we don't interpret
Expand Down Expand Up @@ -2338,8 +2336,8 @@ def lookup(abbrev: str) -> Optional[List[BIN_Tuple]]:
# OK: replace the number/Roman numeral and the period
# with an ordinal token
num = (
token.integer
if token.kind == TOK.NUMBER
token.integer
if token.kind == TOK.NUMBER
else roman_to_int(token.txt)
)
token = TOK.Ordinal(token.concatenate(next_token), num)
Expand Down Expand Up @@ -2464,8 +2462,8 @@ def lookup(abbrev: str) -> Optional[List[BIN_Tuple]]:

# Cases such as 19 $, 199.99 $
if (
token.kind == TOK.NUMBER
and next_token.kind == TOK.PUNCTUATION
token.kind == TOK.NUMBER
and next_token.kind == TOK.PUNCTUATION
and next_token.txt in CURRENCY_SYMBOLS
):
token = TOK.Amount(
Expand Down Expand Up @@ -2552,7 +2550,7 @@ def parse_sentences(token_stream: Iterator[Tok]) -> Iterator[Tok]:
yield tok_begin_sentence
in_sentence = True
if (
token.punctuation in PUNCT_INDIRECT_SPEECH
token.punctuation in PUNCT_INDIRECT_SPEECH
and next_token.punctuation in DQUOTES
):
yield token
Expand All @@ -2576,7 +2574,7 @@ def parse_sentences(token_stream: Iterator[Tok]) -> Iterator[Tok]:
):
# Combining punctuation ('??!!!')
while (
token.punctuation in PUNCT_COMBINATIONS
token.punctuation in PUNCT_COMBINATIONS
and next_token.punctuation in PUNCT_COMBINATIONS
):
# The normalized form comes from the first token except with "…?"
Expand Down Expand Up @@ -2731,8 +2729,8 @@ def parse_phrases_1(token_stream: Iterator[Tok]) -> Iterator[Tok]:
next_token = next(token_stream)

if (
token.kind == TOK.NUMBER
and next_token.kind == TOK.TELNO
token.kind == TOK.NUMBER
and next_token.kind == TOK.TELNO
and token.txt in COUNTRY_CODES
):
# Check for country code in front of telephone number
Expand Down Expand Up @@ -3178,6 +3176,8 @@ def valid_sent(sent: Optional[List[Tok]]) -> bool:
r"|([\+\-\$€]?\d+\,\d+(?!\.\d))" # -1234,56
# The following regex catches English numbers with a dot only
r"|([\+\-\$€]?\d+\.\d+(?!\,\d))" # -1234.56
# The following regex catches multi-part Icelandic abbreviations, i.e. two or more
# letter groups each followed by a period, e.g. a.m.k., A.M.K., þ.e.a.s.
r"|([a-záðéíóúýþæöA-ZÁÐÉÍÓÚÝÞÆÖ]+\.(?:[a-záðéíóúýþæöA-ZÁÐÉÍÓÚÝÞÆÖ]+\.)+)"
# Finally, space and punctuation
r"|([~\s"
+ "".join("\\" + c for c in PUNCTUATION)
Expand Down
6 changes: 6 additions & 0 deletions test/test_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -1132,6 +1132,12 @@ def test_correct_spaces() -> None:
assert s == "Jón-sem var formaður—mótmælti málinu."
s = t.correct_spaces("Það á að geyma mjólkina við 20 ± 3 °C")
assert s == "Það á að geyma mjólkina við 20±3° C"
s = t.correct_spaces("Við förum t.d. til Íslands o.s.frv.")
assert s == "Við förum t.d. til Íslands o.s.frv."
s = t.correct_spaces("M.a. lögum við bil.")
assert s == "M.a. lögum við bil."
s = t.correct_spaces("HANN BORÐAR Þ.Á.M. EPLI.")
assert s == "HANN BORÐAR Þ.Á.M. EPLI."


def test_abbrev() -> None:
Expand Down