mideind · thorunna · Nov 13, 2023 · Nov 2, 2023 · Nov 2, 2023 · Nov 2, 2023
diff --git a/src/tokenizer/tokenizer.py b/src/tokenizer/tokenizer.py
@@ -316,8 +316,8 @@ def concatenate(
             [len(self_original)] * len(separator) if len(other_origin_spans) > 0 else []
         )
         new_origin_spans = (
-            self_origin_spans
-            + separator_origin_spans
+            self_origin_spans 
+            + separator_origin_spans 
             + [i + len(self_original) for i in other_origin_spans]
         )
 
@@ -1453,7 +1453,7 @@ def generate_raw_tokens(
     big_text: str
 
     for big_text in text_or_gen:
-
+        
         if not one_sent_per_line and not big_text:
             # An explicit empty string in the input always
             # causes a sentence split
@@ -1821,7 +1821,6 @@ def parse(self, rt: Tok) -> Iterable[Tok]:
         self.rt = rt
         self.ate = ate
 
-
 def parse_mixed(
     rt: Tok, handle_kludgy_ordinals: int, convert_numbers: bool
 ) -> Iterable[Tok]:
@@ -1831,7 +1830,6 @@ def parse_mixed(
     pp = PunctuationParser()
 
     while rt.txt:
-
         # Handle punctuation
         yield from pp.parse(rt)
         rt, ate = pp.rt, pp.ate
@@ -2143,7 +2141,7 @@ def lookup(abbrev: str) -> Optional[List[BIN_Tuple]]:
             # i.e. with a trailing period: It can end a sentence
             if token.kind == TOK.DATEREL and "." in token.txt:
                 if (
-                    next_token.txt == "."
+                    next_token.txt == "." 
                     and not token_stream.could_be_end_of_sentence()
                 ):
                     # This is something like 'Ég fæddist 25.9. í Svarfaðardal.'
@@ -2155,8 +2153,8 @@ def lookup(abbrev: str) -> Optional[List[BIN_Tuple]]:
             # abbreviation token
             if next_token.punctuation == ".":
                 if (
-                    token.kind == TOK.WORD
-                    and token.txt[-1] != "."
+                    token.kind == TOK.WORD 
+                    and token.txt[-1] != "." 
                     and is_abbr_with_period(token.txt)
                 ):
                     # Abbreviation ending with period: make a special token for it
@@ -2196,7 +2194,7 @@ def lookup(abbrev: str) -> Optional[List[BIN_Tuple]]:
                             # Set token to the period
                             token = next_token
                         elif (
-                            abbrev in Abbreviations.NOT_FINISHERS
+                            abbrev in Abbreviations.NOT_FINISHERS 
                             or abbrev.lower() in Abbreviations.NOT_FINISHERS
                         ):
                             # This is a potential abbreviation that we don't interpret
@@ -2338,8 +2336,8 @@ def lookup(abbrev: str) -> Optional[List[BIN_Tuple]]:
                         # OK: replace the number/Roman numeral and the period
                         # with an ordinal token
                         num = (
-                            token.integer
-                            if token.kind == TOK.NUMBER
+                            token.integer 
+                            if token.kind == TOK.NUMBER 
                             else roman_to_int(token.txt)
                         )
                         token = TOK.Ordinal(token.concatenate(next_token), num)
@@ -2464,8 +2462,8 @@ def lookup(abbrev: str) -> Optional[List[BIN_Tuple]]:
 
             # Cases such as 19 $, 199.99 $
             if (
-                token.kind == TOK.NUMBER
-                and next_token.kind == TOK.PUNCTUATION
+                token.kind == TOK.NUMBER 
+                and next_token.kind == TOK.PUNCTUATION 
                 and next_token.txt in CURRENCY_SYMBOLS
             ):
                 token = TOK.Amount(
@@ -2552,7 +2550,7 @@ def parse_sentences(token_stream: Iterator[Tok]) -> Iterator[Tok]:
                     yield tok_begin_sentence
                     in_sentence = True
                 if (
-                    token.punctuation in PUNCT_INDIRECT_SPEECH
+                    token.punctuation in PUNCT_INDIRECT_SPEECH 
                     and next_token.punctuation in DQUOTES
                 ):
                     yield token
@@ -2576,7 +2574,7 @@ def parse_sentences(token_stream: Iterator[Tok]) -> Iterator[Tok]:
                 ):
                     # Combining punctuation ('??!!!')
                     while (
-                        token.punctuation in PUNCT_COMBINATIONS
+                        token.punctuation in PUNCT_COMBINATIONS 
                         and next_token.punctuation in PUNCT_COMBINATIONS
                     ):
                         # The normalized form comes from the first token except with "…?"
@@ -2731,8 +2729,8 @@ def parse_phrases_1(token_stream: Iterator[Tok]) -> Iterator[Tok]:
                 next_token = next(token_stream)
 
             if (
-                token.kind == TOK.NUMBER
-                and next_token.kind == TOK.TELNO
+                token.kind == TOK.NUMBER 
+                and next_token.kind == TOK.TELNO 
                 and token.txt in COUNTRY_CODES
             ):
                 # Check for country code in front of telephone number
@@ -3178,6 +3176,8 @@ def valid_sent(sent: Optional[List[Tok]]) -> bool:
     r"|([\+\-\$€]?\d+\,\d+(?!\.\d))"  # -1234,56
     # The following regex catches English numbers with a dot only
     r"|([\+\-\$€]?\d+\.\d+(?!\,\d))"  # -1234.56
+    # Catches Icelandic abbreviations of 2+ dotted parts, e.g. a.m.k., A.M.K., þ.e.a.s.
+    r"|([a-záðéíóúýþæöA-ZÁÐÉÍÓÚÝÞÆÖ]+\.(?:[a-záðéíóúýþæöA-ZÁÐÉÍÓÚÝÞÆÖ]+\.)+)"
     # Finally, space and punctuation
     r"|([~\s"
     + "".join("\\" + c for c in PUNCTUATION)

diff --git a/test/test_tokenizer.py b/test/test_tokenizer.py
@@ -1132,6 +1132,12 @@ def test_correct_spaces() -> None:
     assert s == "Jón-sem var formaður—mótmælti málinu."
     s = t.correct_spaces("Það á   að geyma mjólkina við  20 ±  3 °C")
     assert s == "Það á að geyma mjólkina við 20±3° C"
+    s = t.correct_spaces("Við förum t.d. til Íslands o.s.frv.")
+    assert s == "Við förum t.d. til Íslands o.s.frv."
+    s = t.correct_spaces("M.a. lögum við bil.")
+    assert s == "M.a. lögum við bil."
+    s = t.correct_spaces("HANN BORÐAR Þ.Á.M. EPLI.")
+    assert s == "HANN BORÐAR Þ.Á.M. EPLI."
 
 
 def test_abbrev() -> None: