From cdba9444be523afcbd05cda63bce44feb452957a Mon Sep 17 00:00:00 2001
From: Sveinbjorn Thordarson <sveinbjorn@sveinbjorn.org>
Date: Thu, 22 Aug 2024 19:45:21 +0000
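Subject: [PATCH 1/8] Bumped version to 3.4.5, bumped min Python version to
 3.9, fmt, cleanup, etc.

Also reformats the code to an 88-column line length (previously 120),
adds a Python 3.13 release candidate to the CI matrix, changes the
pyproject.toml license field to the plain "MIT" string, and stops
ignoring DeprecationWarning when running pytest.
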
---
 .github/workflows/python-package.yml |  9 ++--
 LICENSE.txt                          |  2 +-
 MANIFEST.in                          |  1 +
 README.rst                           |  3 +-
 pyproject.toml                       | 16 +++----
 src/tokenizer/__init__.py            |  2 +-
 src/tokenizer/abbrev.py              | 69 +++++++++++++++++++---------
 src/tokenizer/definitions.py         |  4 +-
 src/tokenizer/main.py                | 18 ++++++--
 9 files changed, 81 insertions(+), 43 deletions(-)

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index e432efc..db75859 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -15,10 +15,11 @@ jobs:
     strategy:
       matrix:
         os: [ubuntu-latest]
-        python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12", "pypy-3.9", "pypy-3.10"]
+        python-version: ["3.9", "3.10", "3.11", "3.12", "3.13.0-rc.1", "pypy-3.9", "pypy-3.10"]
 
     steps:
     - uses: actions/checkout@v4
+
     - name: Set up Python ${{ matrix.python-version }}
       uses: actions/setup-python@v5
       with:
@@ -29,10 +30,10 @@ jobs:
         python -m pip install --upgrade pip wheel setuptools
         python -m pip install -e ".[dev]"
 
-    - name: Type check with mypy (only on Python 3.8)
+    - name: Type check with mypy (only on Python 3.9)
       run: |
-        if [ "${{ matrix.python-version }}" == "3.8" ]; then python -m pip install mypy; fi
-        if [ "${{ matrix.python-version }}" == "3.8" ]; then mypy --python-version=3.8 src/tokenizer; fi
+        if [ "${{ matrix.python-version }}" == "3.9" ]; then python -m pip install mypy; fi
+        if [ "${{ matrix.python-version }}" == "3.9" ]; then mypy --python-version=3.9 src/tokenizer; fi
 
     - name: Test with pytest
       run: |
diff --git a/LICENSE.txt b/LICENSE.txt
index 6eebeb7..a3fd327 100644
--- a/LICENSE.txt
+++ b/LICENSE.txt
@@ -1,6 +1,6 @@
 MIT License
 
-Copyright (C) 2023 Miðeind ehf.
+Copyright (C) 2016-2024 Miðeind ehf.
 Original author: Vilhjálmur Þorsteinsson
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
diff --git a/MANIFEST.in b/MANIFEST.in
index 04cc9cf..0c93fb0 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,3 +1,4 @@
 graft src
 prune src/tokenizer/__pycache__
 prune src/tokenizer/.mypy_cache
+exclude src/tokenizer/.DS_Store
\ No newline at end of file
diff --git a/README.rst b/README.rst
index 316697e..50bb5ff 100644
--- a/README.rst
+++ b/README.rst
@@ -12,7 +12,7 @@ Tokenization is a necessary first step in many natural language processing
 tasks, such as word counting, parsing, spell checking, corpus generation, and
 statistical analysis of text.
 
-**Tokenizer** is a compact pure-Python (>= 3.8) executable
+**Tokenizer** is a compact pure-Python (>=3.9) executable
 program and module for tokenizing Icelandic text. It converts input text to
 streams of *tokens*, where each token is a separate word, punctuation sign,
 number/amount, date, e-mail, URL/URI, etc. It also segments the token stream
@@ -809,6 +809,7 @@ can be found in the file ``test/toktest_normal_gold_expected.txt``.
 Changelog
 ---------
 
+* Version 3.4.5: Compatibility with Python 3.13. Now requires Python 3.9 or later.
 * Version 3.4.4: Better handling of abbreviations
 * Version 3.4.3: Various minor fixes. Now requires Python 3.8 or later.
 * Version 3.4.2: Abbreviations and phrases added, ``META_BEGIN`` token added.
diff --git a/pyproject.toml b/pyproject.toml
index 5bd7107..2365a00 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,11 +1,10 @@
 [project]
 name = "tokenizer"
-version = "3.4.4"
+version = "3.4.5"
 description = "A tokenizer for Icelandic text"
 authors = [{ name = "Miðeind ehf.", email = "mideind@mideind.is" }]
 readme = { file = "README.rst", content-type = "text/x-rst" }
-license = { file = "LICENSE.txt" }
-# For classifier list see: https://pypi.org/pypi?%3Aaction=list_classifiers
+license = "MIT"
 classifiers = [
     "Development Status :: 5 - Production/Stable",
     "Intended Audience :: Developers",
@@ -16,7 +15,6 @@ classifiers = [
     "Natural Language :: Icelandic",
     "Programming Language :: Python",
     "Programming Language :: Python :: 3",
-    "Programming Language :: Python :: 3.8",
     "Programming Language :: Python :: 3.9",
     "Programming Language :: Python :: 3.10",
     "Programming Language :: Python :: 3.11",
@@ -27,7 +25,7 @@ classifiers = [
     "Topic :: Utilities",
     "Topic :: Text Processing :: Linguistic",
 ]
-requires-python = ">=3.8"
+requires-python = ">=3.9"
 
 [project.urls]
 Repository = "https://github.com/mideind/Tokenizer"
@@ -51,17 +49,17 @@ where = ["src"]
 [tool.pytest.ini_options]
 filterwarnings = [
     # Ignore deprecation warnings in libraries, their problem not ours
-    "ignore::DeprecationWarning",
+    # "ignore::DeprecationWarning",
 ]
 
 [tool.ruff]
-line-length = 120
+line-length = 88
 
 [tool.black]
-line-length = 120
+line-length = 88
 
 [tool.isort]
 # This forces these imports to placed at the top
 known_future_library = ["__future__", "typing", "typing_extensions"]
 profile = "black"
-line_length = 120
+line_length = 88
diff --git a/src/tokenizer/__init__.py b/src/tokenizer/__init__.py
index 51fba02..6f14e9e 100644
--- a/src/tokenizer/__init__.py
+++ b/src/tokenizer/__init__.py
@@ -63,7 +63,7 @@
 from .abbrev import Abbreviations, ConfigError
 
 __author__ = "Miðeind ehf."
-__copyright__ = "(C) 2023 Miðeind ehf."
+__copyright__ = "(C) 2016-2024 Miðeind ehf."
 __version__ = importlib.metadata.version("tokenizer")
 
 
diff --git a/src/tokenizer/abbrev.py b/src/tokenizer/abbrev.py
index a08ce5b..a57c954 100644
--- a/src/tokenizer/abbrev.py
+++ b/src/tokenizer/abbrev.py
@@ -43,7 +43,6 @@
 
 
 class ConfigError(Exception):
-
     pass
 
 
@@ -51,17 +50,16 @@ class ConfigError(Exception):
 
 
 class OrderedSet(Generic[_T]):
-
-    """ Shim class to provide an ordered set API on top
-        of an OrderedDict. This is necessary to make abbreviation
-        lookups predictable and repeatable, which they would not be
-        if a standard Python set() was used. """
+    """Shim class to provide an ordered set API on top
+    of an OrderedDict. This is necessary to make abbreviation
+    lookups predictable and repeatable, which they would not be
+    if a standard Python set() was used."""
 
     def __init__(self) -> None:
         self._dict: Dict[_T, None] = OrderedDict()
 
     def add(self, item: _T) -> None:
-        """ Add an item at the end of the ordered set """
+        """Add an item at the end of the ordered set"""
         if item not in self._dict:
             self._dict[item] = None
 
@@ -73,9 +71,8 @@ def __iter__(self) -> Iterator[_T]:
 
 
 class Abbreviations:
-
-    """ Wrapper around dictionary of abbreviations,
-        initialized from the config file """
+    """Wrapper around dictionary of abbreviations,
+    initialized from the config file"""
 
     # Dictionary of abbreviations and their meanings
     DICT: Dict[str, OrderedSet[BIN_Tuple]] = defaultdict(OrderedSet)
@@ -107,8 +104,8 @@ class Abbreviations:
 
     @staticmethod
     def add(abbrev: str, meaning: str, gender: str, fl: Optional[str] = None) -> None:
-        """ Add an abbreviation to the dictionary.
-            Called from the config file handler. """
+        """Add an abbreviation to the dictionary.
+        Called from the config file handler."""
         # Check for sentence finishers
         finisher = False
         not_finisher = False
@@ -152,7 +149,14 @@ def add(abbrev: str, meaning: str, gender: str, fl: Optional[str] = None) -> Non
         # Append the abbreviation and its meaning in tuple form
         # Multiple meanings are supported for each abbreviation
         Abbreviations.DICT[abbrev].add(
-            BIN_Tuple(meaning, 0, gender, "skst" if fl is None else fl, abbrev, "-",)
+            BIN_Tuple(
+                meaning,
+                0,
+                gender,
+                "skst" if fl is None else fl,
+                abbrev,
+                "-",
+            )
         )
         Abbreviations.MEANINGS.add(meaning)
         # Adding wrong versions of abbreviations
@@ -169,7 +173,14 @@ def add(abbrev: str, meaning: str, gender: str, fl: Optional[str] = None) -> Non
                 # as abbreviations, even though they are listed as such
                 # in the form 'Í.' and 'Á.' for use within person names
                 Abbreviations.WRONGDICT[wabbrev].add(
-                    BIN_Tuple(meaning, 0, gender, "skst" if fl is None else fl, wabbrev, "-",)
+                    BIN_Tuple(
+                        meaning,
+                        0,
+                        gender,
+                        "skst" if fl is None else fl,
+                        wabbrev,
+                        "-",
+                    )
                 )
 
         elif "." in abbrev:
@@ -182,7 +193,14 @@ def add(abbrev: str, meaning: str, gender: str, fl: Optional[str] = None) -> Non
                 wabbrev = abbrev[:i] + abbrev[i + 1 :]
                 Abbreviations.WRONGDOTS[wabbrev].append(abbrev)
                 Abbreviations.WRONGDICT[wabbrev].add(
-                    BIN_Tuple(meaning, 0, gender, "skst" if fl is None else fl, wabbrev, "-",)
+                    BIN_Tuple(
+                        meaning,
+                        0,
+                        gender,
+                        "skst" if fl is None else fl,
+                        wabbrev,
+                        "-",
+                    )
                 )
             if len(indices) > 2:
                 # 3 or 4 dots currently in vocabulary
@@ -214,7 +232,14 @@ def add(abbrev: str, meaning: str, gender: str, fl: Optional[str] = None) -> Non
             Abbreviations.WRONGSINGLES.add(wabbrev)
             Abbreviations.WRONGDOTS[wabbrev].append(abbrev)
             Abbreviations.WRONGDICT[wabbrev].add(
-                BIN_Tuple(meaning, 0, gender, "skst" if fl is None else fl, wabbrev, "-",)
+                BIN_Tuple(
+                    meaning,
+                    0,
+                    gender,
+                    "skst" if fl is None else fl,
+                    wabbrev,
+                    "-",
+                )
             )
         if finisher:
             Abbreviations.FINISHERS.add(abbrev)
@@ -233,7 +258,7 @@ def has_abbreviation(meaning: str) -> bool:
 
     @staticmethod
     def get_meaning(abbrev: str) -> Optional[List[BIN_Tuple]]:
-        """ Lookup meaning(s) of abbreviation, if available. """
+        """Look up meaning(s) of abbreviation, if available."""
         m = Abbreviations.DICT.get(abbrev)
         if not m:
             m = Abbreviations.WRONGDICT.get(abbrev)
@@ -241,7 +266,7 @@ def get_meaning(abbrev: str) -> Optional[List[BIN_Tuple]]:
 
     @staticmethod
     def _handle_abbreviations(s: str) -> None:
-        """ Handle abbreviations in the settings section """
+        """Handle abbreviations in the settings section"""
         # Format: abbrev[*] = "meaning" gender (kk|kvk|hk)
         # An asterisk after an abbreviation ending with a period
         # indicates that the abbreviation may finish a sentence
@@ -272,21 +297,23 @@ def _handle_abbreviations(s: str) -> None:
 
     @staticmethod
     def _handle_not_abbreviations(s: str) -> None:
-        """ Handle not_abbreviations in the settings section """
+        """Handle not_abbreviations in the settings section"""
         if len(s) < 3 or s[0] != '"' or s[-1] != '"':
             raise ConfigError("not_abbreviations should be enclosed in double quotes")
         Abbreviations.NOT_ABBREVIATIONS.add(s[1:-1])
 
     @staticmethod
     def initialize():
-        """ Read the abbreviations config file """
+        """Read the abbreviations config file"""
         with Abbreviations._lock:
             if len(Abbreviations.DICT):
                 # Already initialized
                 return
 
             section = None
-            config = open_text(package="tokenizer", resource="Abbrev.conf", encoding="utf-8")
+            config = open_text(
+                package="tokenizer", resource="Abbrev.conf", encoding="utf-8"
+            )  # TODO: Deprecated since Python 3.11; migrate to files()
             for s in config:
                 # Ignore comments
                 ix = s.find("#")
diff --git a/src/tokenizer/definitions.py b/src/tokenizer/definitions.py
index 9811192..812aeb7 100644
--- a/src/tokenizer/definitions.py
+++ b/src/tokenizer/definitions.py
@@ -534,8 +534,8 @@ class PersonNameTuple(NamedTuple):
     "N": "Norður",
 }
 
-_unit_lambda: Callable[[str], str] = (
-    lambda unit: unit + r"(?!\w)" if unit[-1].isalpha() else unit
+_unit_lambda: Callable[[str], str] = lambda unit: (
+    unit + r"(?!\w)" if unit[-1].isalpha() else unit
 )
 
 SI_UNITS_SET: FrozenSet[str] = frozenset(SI_UNITS.keys())
diff --git a/src/tokenizer/main.py b/src/tokenizer/main.py
index 95e0e63..b6a94eb 100755
--- a/src/tokenizer/main.py
+++ b/src/tokenizer/main.py
@@ -71,8 +71,12 @@
 
 group = parser.add_mutually_exclusive_group()
 
-group.add_argument("--csv", help="Output one token per line in CSV format", action="store_true")
-group.add_argument("--json", help="Output one token per line in JSON format", action="store_true")
+group.add_argument(
+    "--csv", help="Output one token per line in CSV format", action="store_true"
+)
+group.add_argument(
+    "--json", help="Output one token per line in JSON format", action="store_true"
+)
 
 parser.add_argument(
     "-s",
@@ -92,7 +96,10 @@
     "-p",
     "--coalesce_percent",
     action="store_true",
-    help=("Numbers combined into one token with percentage word forms " "(prósent/prósentustig/hundraðshlutar)"),
+    help=(
+        "Numbers combined into one token with percentage word forms "
+        "(prósent/prósentustig/hundraðshlutar)"
+    ),
 )
 
 parser.add_argument(
@@ -127,7 +134,10 @@
     "-c",
     "--convert_numbers",
     action="store_true",
-    help=("English-style decimal points and thousands separators " "in numbers changed to Icelandic style"),
+    help=(
+        "English-style decimal points and thousands separators "
+        "in numbers changed to Icelandic style"
+    ),
 )
 
 parser.add_argument(

From b26b89ef36ad0729f5e24a961c10eda945956ff1 Mon Sep 17 00:00:00 2001
From: Sveinbjorn Thordarson <sveinbjorn@sveinbjorn.org>
Date: Thu, 22 Aug 2024 20:14:14 +0000
Subject: [PATCH 2/8] Modern typing annotation: Set, List, Dict -> set, list,
 dict, etc.

---
 src/tokenizer/abbrev.py      | 28 +++++++++----------
 src/tokenizer/definitions.py | 13 ++++-----
 src/tokenizer/main.py        | 12 ++++----
 src/tokenizer/tokenizer.py   | 53 ++++++++++++++++--------------------
 4 files changed, 48 insertions(+), 58 deletions(-)

diff --git a/src/tokenizer/abbrev.py b/src/tokenizer/abbrev.py
index a57c954..010cc29 100644
--- a/src/tokenizer/abbrev.py
+++ b/src/tokenizer/abbrev.py
@@ -33,7 +33,7 @@
 
 """
 
-from typing import Generic, Iterator, Optional, Set, List, Dict, TypeVar
+from typing import Generic, Iterator, Optional, TypeVar
 
 from threading import Lock
 from collections import defaultdict, OrderedDict
@@ -56,7 +56,7 @@ class OrderedSet(Generic[_T]):
     if a standard Python set() was used."""
 
     def __init__(self) -> None:
-        self._dict: Dict[_T, None] = OrderedDict()
+        self._dict: dict[_T, None] = OrderedDict()
 
     def add(self, item: _T) -> None:
         """Add an item at the end of the ordered set"""
@@ -75,29 +75,29 @@ class Abbreviations:
     initialized from the config file"""
 
     # Dictionary of abbreviations and their meanings
-    DICT: Dict[str, OrderedSet[BIN_Tuple]] = defaultdict(OrderedSet)
+    DICT: dict[str, OrderedSet[BIN_Tuple]] = defaultdict(OrderedSet)
     # Wrong versions of abbreviations
-    WRONGDICT: Dict[str, OrderedSet[BIN_Tuple]] = defaultdict(OrderedSet)
+    WRONGDICT: dict[str, OrderedSet[BIN_Tuple]] = defaultdict(OrderedSet)
     # All abbreviation meanings
-    MEANINGS: Set[str] = set()
+    MEANINGS: set[str] = set()
     # Single-word abbreviations, i.e. those with only one dot at the end
-    SINGLES: Set[str] = set()
+    SINGLES: set[str] = set()
     # Set of abbreviations without periods, e.g. "td", "osfrv"
-    WRONGSINGLES: Set[str] = set()
+    WRONGSINGLES: set[str] = set()
     # Potential sentence finishers, i.e. those with a dot at the end,
     # marked with an asterisk in the config file
-    FINISHERS: Set[str] = set()
+    FINISHERS: set[str] = set()
     # Abbreviations that should not be seen as such at the end of sentences,
     # marked with an exclamation mark in the config file
-    NOT_FINISHERS: Set[str] = set()
+    NOT_FINISHERS: set[str] = set()
     # Abbreviations that should not be seen as such at the end of sentences, but
     # are allowed in front of person names; marked with a hat ^ in the config file
-    NAME_FINISHERS: Set[str] = set()
+    NAME_FINISHERS: set[str] = set()
     # Wrong versions of abbreviations with possible corrections
     # wrong version : [correction1, correction2, ...]
-    WRONGDOTS: Dict[str, List[str]] = defaultdict(list)
+    WRONGDOTS: dict[str, list[str]] = defaultdict(list)
     # Word forms that should never be interpreted as abbreviations
-    NOT_ABBREVIATIONS: Set[str] = set()
+    NOT_ABBREVIATIONS: set[str] = set()
 
     # Ensure that only one thread initializes the abbreviations
     _lock = Lock()
@@ -208,7 +208,7 @@ def add(abbrev: str, meaning: str, gender: str, fl: Optional[str] = None) -> Non
                 i1 = indices[0]
                 i2 = indices[1]
                 i3 = indices[2]
-                wabbrevs: List[str] = []
+                wabbrevs: list[str] = []
                 # 1 and 2 removed
                 wabbrevs.append(abbrev[:i1] + abbrev[i1 + 1 : i2] + abbrev[i2 + 1 :])
                 # 1 and 3 removed
@@ -257,7 +257,7 @@ def has_abbreviation(meaning: str) -> bool:
         return meaning in Abbreviations.MEANINGS
 
     @staticmethod
-    def get_meaning(abbrev: str) -> Optional[List[BIN_Tuple]]:
+    def get_meaning(abbrev: str) -> Optional[list[BIN_Tuple]]:
         """Look up meaning(s) of abbreviation, if available."""
         m = Abbreviations.DICT.get(abbrev)
         if not m:
diff --git a/src/tokenizer/definitions.py b/src/tokenizer/definitions.py
index 812aeb7..b327939 100644
--- a/src/tokenizer/definitions.py
+++ b/src/tokenizer/definitions.py
@@ -29,13 +29,10 @@
 """
 
 from typing import (
-    Dict,
-    FrozenSet,
     Mapping,
     Tuple,
     Union,
     Callable,
-    List,
     Sequence,
     Optional,
     NamedTuple,
@@ -47,13 +44,13 @@
 
 BeginTuple = Tuple[int, Optional[int]]
 PunctuationTuple = Tuple[int, str]
-NumberTuple = Tuple[float, Optional[List[str]], Optional[List[str]]]
+NumberTuple = Tuple[float, Optional[list[str]], Optional[list[str]]]
 DateTimeTuple = Tuple[int, int, int]
 MeasurementTuple = Tuple[str, float]
 TimeStampTuple = Tuple[int, int, int, int, int, int]
-AmountTuple = Tuple[float, str, Optional[List[str]], Optional[List[str]]]
+AmountTuple = Tuple[float, str, Optional[list[str]], Optional[list[str]]]
 TelnoTuple = Tuple[str, str]
-CurrencyTuple = Tuple[str, Optional[List[str]], Optional[List[str]]]
+CurrencyTuple = Tuple[str, Optional[list[str]], Optional[list[str]]]
 
 
 class BIN_Tuple(NamedTuple):
@@ -434,7 +431,7 @@ class PersonNameTuple(NamedTuple):
 SINGLECHAR_FRACTIONS = "↉⅒⅑⅛⅐⅙⅕¼⅓½⅖⅔⅜⅗¾⅘⅝⅚⅞"
 
 # Derived unit : (base SI unit, conversion factor/function)
-SI_UNITS: Dict[str, Tuple[str, Union[float, Callable[[float], float]]]] = {
+SI_UNITS: dict[str, Tuple[str, Union[float, Callable[[float], float]]]] = {
     # Distance
     "m": ("m", 1.0),
     "mm": ("m", 1.0e-3),
@@ -538,7 +535,7 @@ class PersonNameTuple(NamedTuple):
     unit + r"(?!\w)" if unit[-1].isalpha() else unit
 )
 
-SI_UNITS_SET: FrozenSet[str] = frozenset(SI_UNITS.keys())
+SI_UNITS_SET: frozenset[str] = frozenset(SI_UNITS.keys())
 SI_UNITS_REGEX_STRING = r"|".join(
     map(
         # If the unit ends with a letter, don't allow the next character
diff --git a/src/tokenizer/main.py b/src/tokenizer/main.py
index b6a94eb..aec191d 100755
--- a/src/tokenizer/main.py
+++ b/src/tokenizer/main.py
@@ -35,7 +35,7 @@
 
 """
 
-from typing import TextIO, Dict, Iterator, List, Callable, Any, Tuple, Union, cast
+from typing import TextIO, Iterator, Callable, Any, Tuple, Union, cast
 
 import sys
 import argparse
@@ -158,14 +158,14 @@ def main() -> None:
     """Main function, called when the tokenize command is invoked"""
 
     args = parser.parse_args()
-    options: Dict[str, bool] = dict()
+    options: dict[str, bool] = dict()
 
     def quote(s: str) -> str:
         """Return the string s within double quotes, and with any contained
         backslashes and double quotes escaped with a backslash"""
         return '"' + s.replace("\\", "\\\\").replace('"', '\\"') + '"'
 
-    def spanquote(l: List[int]) -> str:
+    def spanquote(l: list[int]) -> str:
         """Return the list l as a string within double quotes"""
         return '"' + "-".join(str(x) for x in l) + '"'
 
@@ -180,7 +180,7 @@ def val(t: Tok, quote_word: bool = False) -> Any:
             return None
         if t.kind == TOK.WORD:
             # Get the full expansion of an abbreviation
-            mm = cast(List[BIN_Tuple], t.val)
+            mm = cast(list[BIN_Tuple], t.val)
             if quote_word:
                 # Return a |-delimited list of possible meanings,
                 # joined into a single string
@@ -254,7 +254,7 @@ def val(t: Tok, quote_word: bool = False) -> Any:
 
     # Configure our JSON dump function
     json_dumps = partial(json.dumps, ensure_ascii=False, separators=(",", ":"))
-    curr_sent: List[str] = []
+    curr_sent: list[str] = []
     tsep = "" if args.original else " "  # token separator
     for t in tokenize(gen(args.infile), **options):
         if args.csv:
@@ -275,7 +275,7 @@ def val(t: Tok, quote_word: bool = False) -> Any:
                 print('0,"","","",""', file=args.outfile)
         elif args.json:
             # Output the tokens in JSON format, one line per token
-            d: Dict[str, Union[str, List[int]]] = dict(k=TOK.descr[t.kind])
+            d: dict[str, Union[str, list[int]]] = dict(k=TOK.descr[t.kind])
             if t.txt is not None:
                 d["t"] = t.txt
             v = val(t)
diff --git a/src/tokenizer/tokenizer.py b/src/tokenizer/tokenizer.py
index 1a6dfc8..35bee09 100644
--- a/src/tokenizer/tokenizer.py
+++ b/src/tokenizer/tokenizer.py
@@ -42,10 +42,8 @@
     Any,
     Callable,
     Deque,
-    FrozenSet,
     Iterable,
     Iterator,
-    List,
     Mapping,
     Match,
     Optional,
@@ -77,7 +75,6 @@
 
 
 class Tok:
-
     """Information about a single token"""
 
     def __init__(
@@ -86,7 +83,7 @@ def __init__(
         txt: Optional[str],
         val: ValType,
         original: Optional[str] = None,
-        origin_spans: Optional[List[int]] = None,
+        origin_spans: Optional[list[int]] = None,
     ) -> None:
         # Type of token
         self.kind: int = kind
@@ -101,7 +98,7 @@ def __init__(
         # Each such integer index maps the corresponding character
         # (which may have substitutions) to its index in 'original'.
         # This is required to preserve 'original' correctly when splitting.
-        self.origin_spans: Optional[List[int]] = origin_spans
+        self.origin_spans: Optional[list[int]] = origin_spans
 
     @classmethod
     def from_txt(cls: Type[_T], txt: str) -> _T:
@@ -312,7 +309,7 @@ def concatenate(
 
         self_origin_spans = self.origin_spans or []
         other_origin_spans = other.origin_spans or []
-        separator_origin_spans: List[int] = (
+        separator_origin_spans: list[int] = (
             [len(self_original)] * len(separator) if len(other_origin_spans) > 0 else []
         )
         new_origin_spans = (
@@ -373,7 +370,6 @@ def quoted_string_repr(obj: Any) -> str:
 
 
 class TOK:
-
     """
     The TOK class contains constants that define token types and
     constructors for creating token instances.
@@ -647,8 +643,8 @@ def Email(t: Union[Tok, str]) -> Tok:
     def Number(
         t: Union[Tok, str],
         n: float,
-        cases: Optional[List[str]] = None,
-        genders: Optional[List[str]] = None,
+        cases: Optional[list[str]] = None,
+        genders: Optional[list[str]] = None,
     ) -> Tok:
         # The cases parameter is a list of possible cases for this number
         # (if it was originally stated in words)
@@ -670,8 +666,8 @@ def NumberWithLetter(t: Union[Tok, str], n: int, c: str) -> Tok:
     def Currency(
         t: Union[Tok, str],
         iso: str,
-        cases: Optional[List[str]] = None,
-        genders: Optional[List[str]] = None,
+        cases: Optional[list[str]] = None,
+        genders: Optional[list[str]] = None,
     ) -> Tok:
         # The cases parameter is a list of possible cases for this currency name
         # (if it was originally stated in words, i.e. not abbreviated)
@@ -686,8 +682,8 @@ def Amount(
         t: Union[Tok, str],
         iso: str,
         n: float,
-        cases: Optional[List[str]] = None,
-        genders: Optional[List[str]] = None,
+        cases: Optional[list[str]] = None,
+        genders: Optional[list[str]] = None,
     ) -> Tok:
         # The cases parameter is a list of possible cases for this amount
         # (if it was originally stated in words)
@@ -701,8 +697,8 @@ def Amount(
     def Percent(
         t: Union[Tok, str],
         n: float,
-        cases: Optional[List[str]] = None,
-        genders: Optional[List[str]] = None,
+        cases: Optional[list[str]] = None,
+        genders: Optional[list[str]] = None,
     ) -> Tok:
         if isinstance(t, str):
             return Tok(TOK.PERCENT, t, (n, cases, genders))
@@ -1559,7 +1555,7 @@ def generate_raw_tokens(
 
 def could_be_end_of_sentence(
     next_token: Tok,
-    test_set: FrozenSet[int] = TOK.TEXT,
+    test_set: frozenset[int] = TOK.TEXT,
     multiplier: bool = False,
 ) -> bool:
     """Return True if next_token could be ending the current sentence or
@@ -1578,7 +1574,6 @@ def could_be_end_of_sentence(
 
 
 class LetterParser:
-
     """Parses a sequence of alphabetic characters
     off the front of a raw token"""
 
@@ -1663,7 +1658,6 @@ def parse(self) -> Iterable[Tok]:
 
 
 class NumberParser:
-
     """Parses a sequence of digits off the front of a raw token"""
 
     def __init__(
@@ -1724,7 +1718,6 @@ def parse(self) -> Iterable[Tok]:
 
 
 class PunctuationParser:
-
     """Parses a sequence of punctuation off the front of a raw token"""
 
     def __init__(self) -> None:
@@ -2108,7 +2101,7 @@ def is_abbr_with_period(txt: str) -> bool:
             return txt not in Abbreviations.DICT
         return False
 
-    def lookup(abbrev: str) -> Optional[List[BIN_Tuple]]:
+    def lookup(abbrev: str) -> Optional[list[BIN_Tuple]]:
         """Look up an abbreviation, both in original case and in lower case,
         and return either None if not found or a meaning list having one entry"""
         m = Abbreviations.DICT.get(abbrev)
@@ -2647,7 +2640,7 @@ def parse_phrases_1(token_stream: Iterator[Tok]) -> Iterator[Tok]:
                 if abbrev in Abbreviations.FINISHERS:
                     token = TOK.Word(
                         token.concatenate(next_token),
-                        cast(Optional[List[BIN_Tuple]], token.val),
+                        cast(Optional[list[BIN_Tuple]], token.val),
                     )
                     next_token = next(token_stream)
 
@@ -2975,7 +2968,7 @@ def parse_phrases_2(
             # Check for composites:
             # 'stjórnskipunar- og eftirlitsnefnd'
             # 'dómsmála-, viðskipta- og iðnaðarráðherra'
-            tq: List[Tok] = []
+            tq: list[Tok] = []
             while token.kind == TOK.WORD and next_token.punctuation == COMPOSITE_HYPHEN:
                 # Accumulate the prefix in tq
                 tq.append(token)
@@ -3081,7 +3074,7 @@ def split_into_sentences(
         to_text = lambda t: t.original or t.txt
     else:
         to_text = lambda t: t.txt
-    curr_sent: List[str] = []
+    curr_sent: list[str] = []
     for t in tokenize_without_annotation(text_or_gen, **options):
         if t.kind in TOK.END:
             # End of sentence/paragraph
@@ -3111,14 +3104,14 @@ def mark_paragraphs(txt: str) -> str:
     return "[[" + "]][[".join(t for t in txt.split("\n") if t) + "]]"
 
 
-def paragraphs(tokens: Iterable[Tok]) -> Iterator[List[Tuple[int, List[Tok]]]]:
+def paragraphs(tokens: Iterable[Tok]) -> Iterator[list[Tuple[int, list[Tok]]]]:
     """Generator yielding paragraphs from token iterable. Each paragraph is a list
     of sentence tuples. Sentence tuples consist of the index of the first token
     of the sentence (the TOK.S_BEGIN token) and a list of the tokens within the
     sentence, not including the starting TOK.S_BEGIN or the terminating TOK.S_END
     tokens."""
 
-    def valid_sent(sent: Optional[List[Tok]]) -> bool:
+    def valid_sent(sent: Optional[list[Tok]]) -> bool:
         """Return True if the token list in sent is a proper
         sentence that we want to process further"""
         if not sent:
@@ -3126,9 +3119,9 @@ def valid_sent(sent: Optional[List[Tok]]) -> bool:
         # A sentence with only punctuation is not valid
         return any(t[0] != TOK.PUNCTUATION for t in sent)
 
-    sent: List[Tok] = []  # Current sentence
+    sent: list[Tok] = []  # Current sentence
     sent_begin = 0
-    current_p: List[Tuple[int, List[Tok]]] = []  # Current paragraph
+    current_p: list[Tuple[int, list[Tok]]] = []  # Current paragraph
 
     for ix, t in enumerate(tokens):
         t0 = t[0]
@@ -3184,7 +3177,7 @@ def correct_spaces(s: str) -> str:
     with correct spacing between tokens.
     NOTE that this function uses a quick-and-dirty approach
     which may not handle all edge cases!"""
-    r: List[str] = []
+    r: list[str] = []
     last = TP_NONE
     double_quote_count = 0
     for w in RE_SPLIT.split(s):
@@ -3244,7 +3237,7 @@ def detokenize(tokens: Iterable[Tok], normalize: bool = False) -> str:
     to a correctly spaced string. If normalize is True,
     punctuation is normalized before assembling the string."""
     to_text: Callable[[Tok], str] = normalized_text if normalize else lambda t: t.txt
-    r: List[str] = []
+    r: list[str] = []
     last = TP_NONE
     double_quote_count = 0
     for t in tokens:
@@ -3278,7 +3271,7 @@ def detokenize(tokens: Iterable[Tok], normalize: bool = False) -> str:
 
 def calculate_indexes(
     tokens: Iterable[Tok], last_is_end: bool = False
-) -> Tuple[List[int], List[int]]:
+) -> Tuple[list[int], list[int]]:
     """Calculate character and byte indexes for a token stream.
     The indexes are the start positions of each token in the original
     text that was tokenized.

From d8a33510319b30e36699294b7249d81d18d3702c Mon Sep 17 00:00:00 2001
From: Sveinbjorn Thordarson <sveinbjorn@sveinbjorn.org>
Date: Thu, 22 Aug 2024 20:27:01 +0000
Subject: [PATCH 3/8] More type modernization

---
 src/tokenizer/definitions.py | 23 +++++++++++------------
 src/tokenizer/tokenizer.py   |  6 +++---
 test/test_tokenizer.py       | 18 +++++++++---------
 3 files changed, 23 insertions(+), 24 deletions(-)

diff --git a/src/tokenizer/definitions.py b/src/tokenizer/definitions.py
index b327939..1093d78 100644
--- a/src/tokenizer/definitions.py
+++ b/src/tokenizer/definitions.py
@@ -30,7 +30,6 @@
 
 from typing import (
     Mapping,
-    Tuple,
     Union,
     Callable,
     Sequence,
@@ -42,15 +41,15 @@
 import re
 
 
-BeginTuple = Tuple[int, Optional[int]]
-PunctuationTuple = Tuple[int, str]
-NumberTuple = Tuple[float, Optional[list[str]], Optional[list[str]]]
-DateTimeTuple = Tuple[int, int, int]
-MeasurementTuple = Tuple[str, float]
-TimeStampTuple = Tuple[int, int, int, int, int, int]
-AmountTuple = Tuple[float, str, Optional[list[str]], Optional[list[str]]]
-TelnoTuple = Tuple[str, str]
-CurrencyTuple = Tuple[str, Optional[list[str]], Optional[list[str]]]
+BeginTuple = tuple[int, Optional[int]]
+PunctuationTuple = tuple[int, str]
+NumberTuple = tuple[float, Optional[list[str]], Optional[list[str]]]
+DateTimeTuple = tuple[int, int, int]
+MeasurementTuple = tuple[str, float]
+TimeStampTuple = tuple[int, int, int, int, int, int]
+AmountTuple = tuple[float, str, Optional[list[str]], Optional[list[str]]]
+TelnoTuple = tuple[str, str]
+CurrencyTuple = tuple[str, Optional[list[str]], Optional[list[str]]]
 
 
 class BIN_Tuple(NamedTuple):
@@ -339,7 +338,7 @@ class PersonNameTuple(NamedTuple):
 # }
 
 # Time of day expressions spelled out
-CLOCK_NUMBERS: Mapping[str, Tuple[int, int, int]] = {
+CLOCK_NUMBERS: Mapping[str, tuple[int, int, int]] = {
     "eitt": (1, 0, 0),
     "tvö": (2, 0, 0),
     "þrjú": (3, 0, 0),
@@ -431,7 +430,7 @@ class PersonNameTuple(NamedTuple):
 SINGLECHAR_FRACTIONS = "↉⅒⅑⅛⅐⅙⅕¼⅓½⅖⅔⅜⅗¾⅘⅝⅚⅞"
 
 # Derived unit : (base SI unit, conversion factor/function)
-SI_UNITS: dict[str, Tuple[str, Union[float, Callable[[float], float]]]] = {
+SI_UNITS: dict[str, tuple[str, Union[float, Callable[[float], float]]]] = {
     # Distance
     "m": ("m", 1.0),
     "mm": ("m", 1.0e-3),
diff --git a/src/tokenizer/tokenizer.py b/src/tokenizer/tokenizer.py
index 35bee09..959dff3 100644
--- a/src/tokenizer/tokenizer.py
+++ b/src/tokenizer/tokenizer.py
@@ -3104,7 +3104,7 @@ def mark_paragraphs(txt: str) -> str:
     return "[[" + "]][[".join(t for t in txt.split("\n") if t) + "]]"
 
 
-def paragraphs(tokens: Iterable[Tok]) -> Iterator[list[Tuple[int, list[Tok]]]]:
+def paragraphs(tokens: Iterable[Tok]) -> Iterator[list[tuple[int, list[Tok]]]]:
     """Generator yielding paragraphs from token iterable. Each paragraph is a list
     of sentence tuples. Sentence tuples consist of the index of the first token
     of the sentence (the TOK.S_BEGIN token) and a list of the tokens within the
@@ -3121,7 +3121,7 @@ def valid_sent(sent: Optional[list[Tok]]) -> bool:
 
     sent: list[Tok] = []  # Current sentence
     sent_begin = 0
-    current_p: list[Tuple[int, list[Tok]]] = []  # Current paragraph
+    current_p: list[tuple[int, list[Tok]]] = []  # Current paragraph
 
     for ix, t in enumerate(tokens):
         t0 = t[0]
@@ -3271,7 +3271,7 @@ def detokenize(tokens: Iterable[Tok], normalize: bool = False) -> str:
 
 def calculate_indexes(
     tokens: Iterable[Tok], last_is_end: bool = False
-) -> Tuple[list[int], list[int]]:
+) -> tuple[list[int], list[int]]:
     """Calculate character and byte indexes for a token stream.
     The indexes are the start positions of each token in the original
     text that was tokenized.
diff --git a/test/test_tokenizer.py b/test/test_tokenizer.py
index 01116fa..21431d8 100755
--- a/test/test_tokenizer.py
+++ b/test/test_tokenizer.py
@@ -31,7 +31,7 @@
 
 """
 
-from typing import Any, Iterable, Iterator, List, Tuple, Union, cast
+from typing import Any, Iterable, Iterator, Union, cast
 
 import tokenizer as t
 from tokenizer.definitions import BIN_Tuple, ValType
@@ -39,10 +39,10 @@
 TOK = t.TOK
 Tok = t.Tok
 
-TestCase = Union[Tuple[str, int], Tuple[str, int, ValType], Tuple[str, List[Tok]]]
+TestCase = Union[tuple[str, int], tuple[str, int, ValType], tuple[str, list[Tok]]]
 
 
-def strip_originals(tokens: List[Tok]) -> List[Tok]:
+def strip_originals(tokens: list[Tok]) -> list[Tok]:
     """Remove origin tracking info from a list of tokens.
     This is useful for simplifying tests where we don't care about tracking
     origins.
@@ -57,7 +57,7 @@ def strip_originals(tokens: List[Tok]) -> List[Tok]:
     return tokens
 
 
-def get_text_and_norm(orig: str) -> Tuple[str, str]:
+def get_text_and_norm(orig: str) -> tuple[str, str]:
     toklist = list(t.tokenize(orig))
     return t.text_from_tokens(toklist), t.normalized_text_from_tokens(toklist)
 
@@ -563,12 +563,12 @@ def test_single_tokens() -> None:
     def run_test(test_cases: Iterable[TestCase], **options: Any) -> None:
         for test_case in test_cases:
             if len(test_case) == 3:
-                txt, kind, val = cast(Tuple[str, int, ValType], test_case)
+                txt, kind, val = cast(tuple[str, int, ValType], test_case)
                 c = [Tok(kind, txt, val)]
             elif isinstance(test_case[1], list):
-                txt, c = cast(Tuple[str, List[Tok]], test_case)
+                txt, c = cast(tuple[str, list[Tok]], test_case)
             else:
-                txt, kind = cast(Tuple[str, int], test_case)
+                txt, kind = cast(tuple[str, int], test_case)
                 c = [Tok(kind, txt, None)]
             l = list(t.tokenize(txt, **options))
             assert len(l) == len(c) + 2, repr(l)
@@ -593,8 +593,8 @@ def run_test(test_cases: Iterable[TestCase], **options: Any) -> None:
                     if check.kind == TOK.WORD:
                         # Test set equivalence, since the order of word meanings
                         # is not deterministic
-                        assert set(cast(List[BIN_Tuple], tok.val) or []) == set(
-                            cast(List[BIN_Tuple], check.val) or []
+                        assert set(cast(list[BIN_Tuple], tok.val) or []) == set(
+                            cast(list[BIN_Tuple], check.val) or []
                         ), (repr(tok.val) + " != " + repr(check.val))
                     else:
                         assert tok.val == check.val, (

From b8909a45cdb2d7b2ce491136bb9d5852571fd62c Mon Sep 17 00:00:00 2001
From: Sveinbjorn Thordarson <sveinbjorn@sveinbjorn.org>
Date: Thu, 22 Aug 2024 20:39:24 +0000
Subject: [PATCH 4/8] Fixed license entry in pyproject metadata

---
 .github/workflows/python-package.yml | 2 +-
 pyproject.toml                       | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index db75859..15df45f 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -30,7 +30,7 @@ jobs:
         python -m pip install --upgrade pip wheel setuptools
         python -m pip install -e ".[dev]"
 
-    - name: Type check with mypy (only on Python 3.9)
+    - name: Type check with mypy (only on oldest supported Python version)
       run: |
         if [ "${{ matrix.python-version }}" == "3.9" ]; then python -m pip install mypy; fi
         if [ "${{ matrix.python-version }}" == "3.9" ]; then mypy --python-version=3.9 src/tokenizer; fi
diff --git a/pyproject.toml b/pyproject.toml
index 2365a00..3ab24ae 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ version = "3.4.5"
 description = "A tokenizer for Icelandic text"
 authors = [{ name = "Miðeind ehf.", email = "mideind@mideind.is" }]
 readme = { file = "README.rst", content-type = "text/x-rst" }
-license = "MIT"
+license = { text = "MIT" }
 classifiers = [
     "Development Status :: 5 - Production/Stable",
     "Intended Audience :: Developers",

From 7c2bbb841d0f0934f3815292db4cfcca409f74cf Mon Sep 17 00:00:00 2001
From: Sveinbjorn Thordarson <sveinbjorn@sveinbjorn.org>
Date: Thu, 22 Aug 2024 21:21:57 +0000
Subject: [PATCH 5/8] More type modernization

---
 src/tokenizer/main.py      |  4 ++--
 src/tokenizer/tokenizer.py | 21 ++++++++++-----------
 2 files changed, 12 insertions(+), 13 deletions(-)

diff --git a/src/tokenizer/main.py b/src/tokenizer/main.py
index aec191d..dea083e 100755
--- a/src/tokenizer/main.py
+++ b/src/tokenizer/main.py
@@ -35,7 +35,7 @@
 
 """
 
-from typing import TextIO, Iterator, Callable, Any, Tuple, Union, cast
+from typing import TextIO, Iterator, Callable, Any, Union, cast
 
 import sys
 import argparse
@@ -213,7 +213,7 @@ def val(t: Tok, quote_word: bool = False) -> Any:
             TOK.MEASUREMENT,
         }:
             # Return a |-delimited list of numbers
-            vv = cast(Tuple[Any, ...], t.val)
+            vv = cast(tuple[Any, ...], t.val)
             return quote("|".join(str(v) for v in vv))
         if quote_word and isinstance(t.val, str):
             return quote(t.val)
diff --git a/src/tokenizer/tokenizer.py b/src/tokenizer/tokenizer.py
index 959dff3..4108108 100644
--- a/src/tokenizer/tokenizer.py
+++ b/src/tokenizer/tokenizer.py
@@ -47,7 +47,6 @@
     Mapping,
     Match,
     Optional,
-    Tuple,
     Type,
     TypeVar,
     Union,
@@ -181,7 +180,7 @@ def person_names(self) -> PersonNameList:
             return []
         return cast(PersonNameList, self.val) or []
 
-    def split(self, pos: int) -> Tuple["Tok", "Tok"]:
+    def split(self, pos: int) -> tuple["Tok", "Tok"]:
         """Split this token into two at 'pos'.
         The first token returned will have 'pos'
         characters and the second one will have the rest.
@@ -224,7 +223,7 @@ def split(self, pos: int) -> Tuple["Tok", "Tok"]:
 
         return l, r
 
-    def substitute(self, span: Tuple[int, int], new: str) -> None:
+    def substitute(self, span: tuple[int, int], new: str) -> None:
         """Substitute a span with a single or empty character 'new'."""
         self.txt = self.txt[: span[0]] + new + self.txt[span[1] :]
         if self.origin_spans is not None:
@@ -233,7 +232,7 @@ def substitute(self, span: Tuple[int, int], new: str) -> None:
                 self.origin_spans[: span[0] + len(new)] + self.origin_spans[span[1] :]
             )
 
-    def substitute_longer(self, span: Tuple[int, int], new: str) -> None:
+    def substitute_longer(self, span: tuple[int, int], new: str) -> None:
         """Substitute a span with a potentially longer string"""
 
         # This tracks origin differently from the regular
@@ -321,7 +320,7 @@ def concatenate(
         return Tok(new_kind, new_txt, new_val, new_original, new_origin_spans)
 
     @property
-    def as_tuple(self) -> Tuple[Any, ...]:
+    def as_tuple(self) -> tuple[Any, ...]:
         """Return the contents of this token as a generic tuple,
         suitable e.g. for serialization"""
         return (self.kind, self.txt, self.val)
@@ -950,7 +949,7 @@ def person_names(self, i: int = 0) -> Optional[PersonNameList]:
         t = self[i]
         return t.person_names if t else None
 
-    def as_tuple(self, i: int = 0) -> Optional[Tuple[Any, ...]]:
+    def as_tuple(self, i: int = 0) -> Optional[tuple[Any, ...]]:
         """Return token.as_tuple for token at index i."""
         t = self[i]
         return t.as_tuple if t else None
@@ -964,7 +963,7 @@ def could_be_end_of_sentence(self, i: int = 0, *args: Any) -> bool:
 def normalized_text(token: Tok) -> str:
     """Returns token text after normalizing punctuation"""
     return (
-        cast(Tuple[int, str], token.val)[1]
+        cast(tuple[int, str], token.val)[1]
         if token.kind == TOK.PUNCTUATION
         else token.txt
     )
@@ -991,7 +990,7 @@ def is_valid_date(y: int, m: int, d: int) -> bool:
     return False
 
 
-def parse_digits(tok: Tok, convert_numbers: bool) -> Tuple[Tok, Tok]:
+def parse_digits(tok: Tok, convert_numbers: bool) -> tuple[Tok, Tok]:
     """Parse a raw token starting with a digit"""
     w = tok.txt
     s: Optional[Match[str]] = re.match(r"\d{1,2}:\d\d:\d\d,\d\d(?!\d)", w)
@@ -1334,7 +1333,7 @@ def parse_digits(tok: Tok, convert_numbers: bool) -> Tuple[Tok, Tok]:
     )
 
 
-def html_escape(match: Match[str]) -> Tuple[Tuple[int, int], str]:
+def html_escape(match: Match[str]) -> tuple[tuple[int, int], str]:
     """Regex substitution function for HTML escape codes"""
     g = match.group(4)
     if g is not None:
@@ -1397,7 +1396,7 @@ def generate_rough_tokens_from_tok(tok: Tok) -> Iterator[Tok]:
     # This function further splits those tokens into multiple tokens.
     # Rough tokens are tokens that are separated by white space, i.e. the regex (\\s*)."""
 
-    def shift_span(span: Tuple[int, int], pos: int):
+    def shift_span(span: tuple[int, int], pos: int):
         """Shift a span by a given amount"""
         return (span[SPAN_START] + pos, span[SPAN_END] + pos)
 
@@ -2138,7 +2137,7 @@ def lookup(abbrev: str) -> Optional[list[BIN_Tuple]]:
                     and not token_stream.could_be_end_of_sentence()
                 ):
                     # This is something like 'Ég fæddist 25.9. í Svarfaðardal.'
-                    y, m, d = cast(Tuple[int, int, int], token.val)
+                    y, m, d = cast(tuple[int, int, int], token.val)
                     token = TOK.Daterel(token.concatenate(next_token), y, m, d)
                     next_token = next(token_stream)
 

From 12a2db2e29a9d6523a77936fd434d15ac80ad69d Mon Sep 17 00:00:00 2001
From: Sveinbjorn Thordarson <sveinbjorn@sveinbjorn.org>
Date: Thu, 22 Aug 2024 21:41:44 +0000
Subject: [PATCH 6/8] Updated header + formatting

---
 src/tokenizer/__init__.py      |   5 +-
 src/tokenizer/abbrev.py        |   2 +-
 src/tokenizer/definitions.py   |   2 +-
 src/tokenizer/main.py          |   2 +-
 src/tokenizer/tokenizer.py     |   2 +-
 test/test_detokenize.py        |  19 ++--
 test/test_index_calculation.py | 195 +++++++++++++++++++++++++++++----
 test/test_tokenizer.py         |   2 +-
 test/test_tokenizer_tok.py     |   2 +-
 9 files changed, 193 insertions(+), 38 deletions(-)

diff --git a/src/tokenizer/__init__.py b/src/tokenizer/__init__.py
index 6f14e9e..d57468b 100644
--- a/src/tokenizer/__init__.py
+++ b/src/tokenizer/__init__.py
@@ -1,6 +1,6 @@
 """
 
-    Copyright(C) 2022 Miðeind ehf.
+    Copyright(C) 2016-2024 Miðeind ehf.
     Original author: Vilhjálmur Þorsteinsson
 
     This software is licensed under the MIT License:
@@ -64,8 +64,7 @@
 
 __author__ = "Miðeind ehf."
 __copyright__ = "(C) 2016-2024 Miðeind ehf."
-__version__ = importlib.metadata.version("tokenizer")
-
+__version__ = importlib.metadata.version(__name__)
 
 __all__ = (
     "__author__",
diff --git a/src/tokenizer/abbrev.py b/src/tokenizer/abbrev.py
index 010cc29..bcee6b2 100644
--- a/src/tokenizer/abbrev.py
+++ b/src/tokenizer/abbrev.py
@@ -2,7 +2,7 @@
 
     Abbreviations module for tokenization of Icelandic text
 
-    Copyright (C) 2022 Miðeind ehf.
+    Copyright (C) 2016-2024 Miðeind ehf.
     Original author: Vilhjálmur Þorsteinsson
 
     This software is licensed under the MIT License:
diff --git a/src/tokenizer/definitions.py b/src/tokenizer/definitions.py
index 1093d78..fd3eb42 100644
--- a/src/tokenizer/definitions.py
+++ b/src/tokenizer/definitions.py
@@ -2,7 +2,7 @@
 
     Definitions used for tokenization of Icelandic text
 
-    Copyright (C) 2022 Miðeind ehf.
+    Copyright (C) 2016-2024 Miðeind ehf.
     Original author: Vilhjálmur Þorsteinsson
 
     This software is licensed under the MIT License:
diff --git a/src/tokenizer/main.py b/src/tokenizer/main.py
index dea083e..55d74c1 100755
--- a/src/tokenizer/main.py
+++ b/src/tokenizer/main.py
@@ -3,7 +3,7 @@
 
     Tokenizer for Icelandic text
 
-    Copyright (C) 2022 Miðeind ehf.
+    Copyright (C) 2016-2024 Miðeind ehf.
     Original author: Vilhjálmur Þorsteinsson
 
     This software is licensed under the MIT License:
diff --git a/src/tokenizer/tokenizer.py b/src/tokenizer/tokenizer.py
index 4108108..1089581 100644
--- a/src/tokenizer/tokenizer.py
+++ b/src/tokenizer/tokenizer.py
@@ -2,7 +2,7 @@
 
     Tokenizer for Icelandic text
 
-    Copyright (C) 2022 Miðeind ehf.
+    Copyright (C) 2016-2024 Miðeind ehf.
     Original author: Vilhjálmur Þorsteinsson
 
     This software is licensed under the MIT License:
diff --git a/test/test_detokenize.py b/test/test_detokenize.py
index db5f413..09545e1 100644
--- a/test/test_detokenize.py
+++ b/test/test_detokenize.py
@@ -6,7 +6,7 @@
 
     Tests for Tokenizer module
 
-    Copyright (C) 2022 by Miðeind ehf.
+    Copyright (C) 2016-2024 by Miðeind ehf.
     Original author: Vilhjálmur Þorsteinsson
 
     This software is licensed under the MIT License:
@@ -37,7 +37,7 @@
 
 def test_detokenize() -> None:
 
-    options = { "normalize": True }
+    options = {"normalize": True}
 
     def should_be_equal(s: str) -> None:
         toklist = t.tokenize(s, **options)
@@ -58,19 +58,18 @@ def should_be(s1: str, s2: str) -> None:
     should_be_equal("Páll veiddi 74 cm. lax í Norðurá þann 1.3.")
 
     should_be(
-        "Páll var með \"netfangið\" palli@einn.i.heiminum.is.",
-        "Páll var með „netfangið“ palli@einn.i.heiminum.is."
+        'Páll var með "netfangið" palli@einn.i.heiminum.is.',
+        "Páll var með „netfangið“ palli@einn.i.heiminum.is.",
     )
 
     # !!! BUG
-    #should_be(
+    # should_be(
     #    "Páll var með \"netfangið\", þ.e.a.s. (\"þetta\").",
     #    "Páll var með „netfangið“, þ.e.a.s. („þetta“).",
-    #)
+    # )
 
-    options = { "normalize": False }
+    options = {"normalize": False}
 
     should_be_equal("Páll var með „netfangið“, þ.e.a.s. („þetta“).")
-    should_be_equal("Páll var með \"netfangið\" palli@einn.i.heiminum.is.")
-    should_be_equal("Páll var með \"netfangið\", þ.e.a.s. (\"þetta\").")
-
+    should_be_equal('Páll var með "netfangið" palli@einn.i.heiminum.is.')
+    should_be_equal('Páll var með "netfangið", þ.e.a.s. ("þetta").')
diff --git a/test/test_index_calculation.py b/test/test_index_calculation.py
index 0b59e35..60a81a8 100644
--- a/test/test_index_calculation.py
+++ b/test/test_index_calculation.py
@@ -6,7 +6,7 @@
 
     Tests for Tokenizer module
 
-    Copyright (C) 2022 by Miðeind ehf.
+    Copyright (C) 2016-2024 by Miðeind ehf.
 
     This software is licensed under the MIT License:
 
@@ -169,7 +169,6 @@ def test_small_difficult_cases() -> None:
         assert char_indexes == [0, 2, 4]
         assert byte_indexes == [0, 2, 4]
 
-
     # Two byte characters
     for x in ["þ", "æ", "á"]:
         s = x
@@ -230,12 +229,11 @@ def test_small_difficult_cases() -> None:
         assert char_indexes == [0, 2, 4]
         assert byte_indexes == [0, 3, 6]
 
-
     # Two character characters
     # These strings contain two unicode code points that are rendered as one letter.
     # They are counted as two characters in python.
     # In addition the accent and umlaut characters are two bytes.
-    for x in ["a"+ACCENT, "o"+UMLAUT]:
+    for x in ["a" + ACCENT, "o" + UMLAUT]:
         s = x
         toks = tokenizer.parse_tokens([s])
         char_indexes, byte_indexes = tokenizer.calculate_indexes(toks)
@@ -288,11 +286,11 @@ def test_small_difficult_cases() -> None:
         # example chars:
         #   " a´ a´"
         #    012345
-        #    ^  ^    
+        #    ^  ^
         # example bytes:
         #   " a´_ a´_"
         #    01234567
-        #    ^   ^  
+        #    ^   ^
         toks = tokenizer.parse_tokens([s])
         char_indexes, byte_indexes = tokenizer.calculate_indexes(toks)
         assert char_indexes == [0, 3]
@@ -302,7 +300,6 @@ def test_small_difficult_cases() -> None:
         assert char_indexes == [0, 3, 6]
         assert byte_indexes == [0, 4, 8]
 
-
     # The em-dash is 3 bytes
     for x in [EM_DASH]:
         s = x
@@ -361,7 +358,7 @@ def test_small_difficult_cases() -> None:
         # example bytes:
         #   " a__ a__"
         #    01234567
-        #    ^   ^  
+        #    ^   ^
         toks = tokenizer.parse_tokens([s])
         char_indexes, byte_indexes = tokenizer.calculate_indexes(toks)
         assert char_indexes == [0, 2]
@@ -379,25 +376,181 @@ def test_larger_case() -> None:
     #    x                x                     x  xx                   x
     toks = tokenizer.parse_tokens([s])
     char_indexes, byte_indexes = tokenizer.calculate_indexes(toks)
-    assert char_indexes == [0, 5, 13, 16, 18, 25, 30, 33, 36, 40, 45, 50, 53, 61, 66, 72]
-    assert byte_indexes == [0, 6, 14, 17, 20, 27, 32, 35, 38, 43, 50, 55, 58, 66, 72, 78]
+    assert char_indexes == [
+        0,
+        5,
+        13,
+        16,
+        18,
+        25,
+        30,
+        33,
+        36,
+        40,
+        45,
+        50,
+        53,
+        61,
+        66,
+        72,
+    ]
+    assert byte_indexes == [
+        0,
+        6,
+        14,
+        17,
+        20,
+        27,
+        32,
+        35,
+        38,
+        43,
+        50,
+        55,
+        58,
+        66,
+        72,
+        78,
+    ]
     toks = tokenizer.parse_tokens([s])
     char_indexes, byte_indexes = tokenizer.calculate_indexes(toks, last_is_end=True)
-    assert char_indexes == [0, 5, 13, 16, 18, 25, 30, 33, 36, 40, 45, 50, 53, 61, 66, 72, 73]
-    assert byte_indexes == [0, 6, 14, 17, 20, 27, 32, 35, 38, 43, 50, 55, 58, 66, 72, 78, 79]
+    assert char_indexes == [
+        0,
+        5,
+        13,
+        16,
+        18,
+        25,
+        30,
+        33,
+        36,
+        40,
+        45,
+        50,
+        53,
+        61,
+        66,
+        72,
+        73,
+    ]
+    assert byte_indexes == [
+        0,
+        6,
+        14,
+        17,
+        20,
+        27,
+        32,
+        35,
+        38,
+        43,
+        50,
+        55,
+        58,
+        66,
+        72,
+        78,
+        79,
+    ]
 
 
 def test_iterator_cases() -> None:
-    s = ["Þessi ", "setning ", "er ", "í ", "lengra ", "lagi ", "og ", "er ", "með ", "bæði ", "eins ", "og ", "tveggja ", "bæta ", "stafi."]
+    s = [
+        "Þessi ",
+        "setning ",
+        "er ",
+        "í ",
+        "lengra ",
+        "lagi ",
+        "og ",
+        "er ",
+        "með ",
+        "bæði ",
+        "eins ",
+        "og ",
+        "tveggja ",
+        "bæta ",
+        "stafi.",
+    ]
     # (char and byte indexes in a similar test above)
     toks = tokenizer.parse_tokens(s)
     char_indexes, byte_indexes = tokenizer.calculate_indexes(toks)
-    assert char_indexes == [0, 5, 13, 16, 18, 25, 30, 33, 36, 40, 45, 50, 53, 61, 66, 72]
-    assert byte_indexes == [0, 6, 14, 17, 20, 27, 32, 35, 38, 43, 50, 55, 58, 66, 72, 78]
+    assert char_indexes == [
+        0,
+        5,
+        13,
+        16,
+        18,
+        25,
+        30,
+        33,
+        36,
+        40,
+        45,
+        50,
+        53,
+        61,
+        66,
+        72,
+    ]
+    assert byte_indexes == [
+        0,
+        6,
+        14,
+        17,
+        20,
+        27,
+        32,
+        35,
+        38,
+        43,
+        50,
+        55,
+        58,
+        66,
+        72,
+        78,
+    ]
     toks = tokenizer.parse_tokens(s)
     char_indexes, byte_indexes = tokenizer.calculate_indexes(toks, last_is_end=True)
-    assert char_indexes == [0, 5, 13, 16, 18, 25, 30, 33, 36, 40, 45, 50, 53, 61, 66, 72, 73]
-    assert byte_indexes == [0, 6, 14, 17, 20, 27, 32, 35, 38, 43, 50, 55, 58, 66, 72, 78, 79]
+    assert char_indexes == [
+        0,
+        5,
+        13,
+        16,
+        18,
+        25,
+        30,
+        33,
+        36,
+        40,
+        45,
+        50,
+        53,
+        61,
+        66,
+        72,
+        73,
+    ]
+    assert byte_indexes == [
+        0,
+        6,
+        14,
+        17,
+        20,
+        27,
+        32,
+        35,
+        38,
+        43,
+        50,
+        55,
+        58,
+        66,
+        72,
+        78,
+        79,
+    ]
 
     s = ["Stutt setning.", "", "Önnur setning."]
     #     01234567890123        45678901234567
@@ -493,11 +646,15 @@ def test_lengthening_substitutions() -> None:
     #    ^    ^  ^   ^        ^
     #    x             x
     #             !             lengthening happens here (3ji->þriðji)
-    toks = tokenizer.parse_tokens(s, handle_kludgy_ordinals=tokenizer.KLUDGY_ORDINALS_MODIFY)
+    toks = tokenizer.parse_tokens(
+        s, handle_kludgy_ordinals=tokenizer.KLUDGY_ORDINALS_MODIFY
+    )
     char_indexes, byte_indexes = tokenizer.calculate_indexes(toks)
     assert char_indexes == [0, 5, 8, 12, 21]
     assert byte_indexes == [0, 6, 9, 13, 23]
-    toks = tokenizer.parse_tokens(s, handle_kludgy_ordinals=tokenizer.KLUDGY_ORDINALS_MODIFY)
+    toks = tokenizer.parse_tokens(
+        s, handle_kludgy_ordinals=tokenizer.KLUDGY_ORDINALS_MODIFY
+    )
     char_indexes, byte_indexes = tokenizer.calculate_indexes(toks, last_is_end=True)
     assert char_indexes == [0, 5, 8, 12, 21, 22]
     assert byte_indexes == [0, 6, 9, 13, 23, 24]
diff --git a/test/test_tokenizer.py b/test/test_tokenizer.py
index 21431d8..7b37088 100755
--- a/test/test_tokenizer.py
+++ b/test/test_tokenizer.py
@@ -5,7 +5,7 @@
 
     Tests for Tokenizer module
 
-    Copyright (C) 2022 by Miðeind ehf.
+    Copyright (C) 2016-2024 by Miðeind ehf.
     Original author: Vilhjálmur Þorsteinsson
 
     This software is licensed under the MIT License:
diff --git a/test/test_tokenizer_tok.py b/test/test_tokenizer_tok.py
index 3ee1f46..c0caa7e 100644
--- a/test/test_tokenizer_tok.py
+++ b/test/test_tokenizer_tok.py
@@ -3,7 +3,7 @@
 
     Tests for Tokenizer module
 
-    Copyright (C) 2022 by Miðeind ehf.
+    Copyright (C) 2016-2024 by Miðeind ehf.
 
     This software is licensed under the MIT License:
 

From 491a765797bc230390c18bb82f3312ef820a1023 Mon Sep 17 00:00:00 2001
From: Sveinbjorn Thordarson <sveinbjorn@sveinbjorn.org>
Date: Fri, 23 Aug 2024 00:13:23 +0000
Subject: [PATCH 7/8] Refactoring + explicit Python 3.13 support in metadata
 (tbd)

---
 pyproject.toml               |  2 ++
 src/tokenizer/definitions.py | 11 ++++++++---
 src/tokenizer/tokenizer.py   |  2 +-
 test/test_tokenizer.py       |  2 +-
 4 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 3ab24ae..8e3d464 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -11,6 +11,7 @@ classifiers = [
     "License :: OSI Approved :: MIT License",
     "Operating System :: Unix",
     "Operating System :: POSIX",
+    "Operating System :: MacOS",
     "Operating System :: Microsoft :: Windows",
     "Natural Language :: Icelandic",
     "Programming Language :: Python",
@@ -19,6 +20,7 @@ classifiers = [
     "Programming Language :: Python :: 3.10",
     "Programming Language :: Python :: 3.11",
     "Programming Language :: Python :: 3.12",
+    "Programming Language :: Python :: 3.13",
     "Programming Language :: Python :: Implementation :: CPython",
     "Programming Language :: Python :: Implementation :: PyPy",
     "Topic :: Software Development :: Libraries :: Python Modules",
diff --git a/src/tokenizer/definitions.py b/src/tokenizer/definitions.py
index fd3eb42..12a82d3 100644
--- a/src/tokenizer/definitions.py
+++ b/src/tokenizer/definitions.py
@@ -717,9 +717,8 @@ def roman_to_int(s: str) -> int:
 # Króna amount strings allowed before a number, e.g. "kr. 9.900"
 ISK_AMOUNT_PRECEDING = frozenset(("kr.", "kr", "krónur"))
 
-# URL prefixes. Note that this list should not contain www since
-# www.something.com is a domain token, not a URL token.
-URL_PREFIXES = (
+# URI scheme prefixes. Must not contain "www": www.something.com is a domain token.
+URI_PREFIXES = (
     "http://",
     "https://",
     "file://",
@@ -735,6 +734,12 @@ def roman_to_int(s: str) -> int:
     "telnet://",
     "udp://",
     "vnc://",
+    "irc://",
+    "nntp://",
+    "wss://",
+    "ws://",
+    "xmpp://",
+    "mtqp://",
 )
 
 TOP_LEVEL_DOMAINS = frozenset(
diff --git a/src/tokenizer/tokenizer.py b/src/tokenizer/tokenizer.py
index 1089581..c85dbf9 100644
--- a/src/tokenizer/tokenizer.py
+++ b/src/tokenizer/tokenizer.py
@@ -1848,7 +1848,7 @@ def parse_mixed(
             ate = True
 
         rtxt = rt.txt
-        if rtxt and rtxt.startswith(URL_PREFIXES):
+        if rtxt and rtxt.startswith(URI_PREFIXES):
             # Handle URL: cut RIGHT_PUNCTUATION characters off its end,
             # even though many of them are actually allowed according to
             # the IETF RFC
diff --git a/test/test_tokenizer.py b/test/test_tokenizer.py
index 7b37088..193cfef 100755
--- a/test/test_tokenizer.py
+++ b/test/test_tokenizer.py
@@ -46,7 +46,7 @@ def strip_originals(tokens: list[Tok]) -> list[Tok]:
     """Remove origin tracking info from a list of tokens.
     This is useful for simplifying tests where we don't care about tracking
     origins.
-    XXX: This could be removed if we get a feature to disable origin
+    TODO: This could be removed if we get a feature to disable origin
     tracking during tokenization.
     """
 

From 7f5c92b4d4a9ffc6aca71787b6a78f9de519bb3f Mon Sep 17 00:00:00 2001
From: Sveinbjorn Thordarson <sveinbjorn@sveinbjorn.org>
Date: Fri, 23 Aug 2024 14:52:16 +0000
Subject: [PATCH 8/8] Migrated from deprecated open_text function to modern
 importlib.resources.files API

---
 src/tokenizer/abbrev.py    | 11 ++++++-----
 src/tokenizer/tokenizer.py |  2 +-
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/src/tokenizer/abbrev.py b/src/tokenizer/abbrev.py
index bcee6b2..606f10c 100644
--- a/src/tokenizer/abbrev.py
+++ b/src/tokenizer/abbrev.py
@@ -37,7 +37,7 @@
 
 from threading import Lock
 from collections import defaultdict, OrderedDict
-from importlib.resources import open_text
+import importlib.resources as importlib_resources
 
 from .definitions import BIN_Tuple
 
@@ -311,10 +311,11 @@ def initialize():
                 return
 
             section = None
-            config = open_text(
-                package="tokenizer", resource="Abbrev.conf", encoding="utf-8"
-            )  # TODO: Deprecated in Python 3.13
-            for s in config:
+
+            p = importlib_resources.files("tokenizer").joinpath("Abbrev.conf")
+            config = p.read_text(encoding="utf-8")
+
+            for s in config.split("\n"):
                 # Ignore comments
                 ix = s.find("#")
                 if ix >= 0:
diff --git a/src/tokenizer/tokenizer.py b/src/tokenizer/tokenizer.py
index c85dbf9..2e7be72 100644
--- a/src/tokenizer/tokenizer.py
+++ b/src/tokenizer/tokenizer.py
@@ -1923,7 +1923,7 @@ def parse_mixed(
 
         # Check for currency abbreviations immediately followed by a number
         if len(rt.txt) > 3 and rt.txt[0:3] in CURRENCY_ABBREV and rt.txt[3].isdigit():
-            # XXX: This feels a little hacky
+            # TODO: Hacky; parses the text after the currency code via a temp token
             temp_tok = Tok(TOK.RAW, rt.txt[3:], None)
             digit_tok, _ = parse_digits(temp_tok, convert_numbers)
             if digit_tok.kind == TOK.NUMBER: