Merge remote-tracking branch 'origin/master' into fix-cast
vthorsteinsson committed Mar 23, 2024
2 parents 7e22d70 + 5524df4 commit b5bd606
Showing 13 changed files with 126 additions and 185 deletions.
21 changes: 11 additions & 10 deletions .github/workflows/python-package.yml
@@ -5,9 +5,11 @@ name: tests

 on:
   push:
-    branches: [ master ]
+    branches:
+      - '*' # Run on all branches
   pull_request:
-    branches: [ master ]
+    branches:
+      - '*' # Run on all branches

 jobs:
   build:
@@ -16,23 +18,22 @@ jobs:
     strategy:
       matrix:
         os: [ubuntu-latest]
-        python-version: [ "3.7", "3.8", "3.9", "3.10", "3.11", "pypy-3.7", "pypy-3.8", "pypy-3.9"]
+        python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12", "pypy-3.9", "pypy-3.10"]

     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
       - name: Set up Python ${{ matrix.python-version }}
         uses: actions/setup-python@v4
         with:
           python-version: ${{ matrix.python-version }}
       - name: Install Tokenizer
         run: |
           python -m pip install --upgrade pip wheel setuptools
-          python -m pip install -e .
-      - name: Type check with mypy (only on Python 3.7)
+          python -m pip install -e ".[dev]"
+      - name: Type check with mypy (only on Python 3.8)
         run: |
-          if [ "${{ matrix.python-version }}" == "3.7" ]; then python -m pip install mypy; fi
-          if [ "${{ matrix.python-version }}" == "3.7" ]; then mypy --python-version=3.7 src/tokenizer; fi
+          if [ "${{ matrix.python-version }}" == "3.8" ]; then python -m pip install mypy; fi
+          if [ "${{ matrix.python-version }}" == "3.8" ]; then mypy --python-version=3.8 src/tokenizer; fi
       - name: Test with pytest
         run: |
           python -m pip install pytest
-          pytest
+          python -m pytest
2 changes: 1 addition & 1 deletion .gitignore
@@ -119,7 +119,7 @@
 mypy.ini
 # Vim swap files
 .*.swp

-
+.DS_Store

 # Sveinbjörn's temp files
 test.py
2 changes: 1 addition & 1 deletion LICENSE → LICENSE.txt
@@ -1,6 +1,6 @@
 MIT License

-Copyright (C) 2022 Miðeind ehf.
+Copyright (C) 2023 Miðeind ehf.
 Original author: Vilhjálmur Þorsteinsson

 Permission is hereby granted, free of charge, to any person obtaining a copy
11 changes: 6 additions & 5 deletions README.rst
@@ -2,7 +2,7 @@
 Tokenizer: A tokenizer for Icelandic text
 -----------------------------------------

-.. image:: https://github.com/mideind/Tokenizer/workflows/Python%20package/badge.svg
+.. image:: https://github.com/mideind/Tokenizer/workflows/tests/badge.svg
    :target: https://github.com/mideind/Tokenizer

 Overview
@@ -12,7 +12,7 @@
 Tokenization is a necessary first step in many natural language processing
 tasks, such as word counting, parsing, spell checking, corpus generation, and
 statistical analysis of text.

-**Tokenizer** is a compact pure-Python (>= 3.7) executable
+**Tokenizer** is a compact pure-Python (>= 3.8) executable
 program and module for tokenizing Icelandic text. It converts input text to
 streams of *tokens*, where each token is a separate word, punctuation sign,
 number/amount, date, e-mail, URL/URI, etc. It also segments the token stream
@@ -24,11 +24,11 @@
 in the file ``src/tokenizer/Abbrev.conf``.

 Tokenizer is an independent spinoff from the `Greynir project <https://greynir.is>`_
 (GitHub repository `here <https://github.com/mideind/Greynir>`_), by the same authors.
-The `Greynir natural language parser for Icelandic <https://github.com/mideind/GreynirPackage>`_
+The `Greynir natural language parser for Icelandic <https://github.com/mideind/GreynirEngine>`_
 uses Tokenizer on its input.

 Note that Tokenizer is licensed under the *MIT* license
-while Greynir is licensed under *GPLv3*.
+while GreynirEngine is licensed under *GPLv3*.


 Deep vs. shallow tokenization
@@ -753,7 +753,7 @@
 modify the source files (assuming you have ``git`` available):

     $ git clone https://github.com/mideind/Tokenizer
     $ cd Tokenizer
     $ # [ Activate your virtualenv here, if you have one ]
-    $ pip install -e .
+    $ pip install -e ".[dev]"
 Test suite
@@ -809,6 +809,7 @@
 can be found in the file ``test/toktest_normal_gold_expected.txt``.

 Changelog
 ---------

+* Version 3.4.3: Various minor fixes. Now requires Python 3.8 or later.
 * Version 3.4.2: Abbreviations and phrases added, ``META_BEGIN`` token added.
 * Version 3.4.1: Improved performance on long input chunks.
 * Version 3.4.0: Improved handling and normalization of punctuation.
66 changes: 66 additions & 0 deletions pyproject.toml
@@ -0,0 +1,66 @@
+[project]
+name = "tokenizer"
+version = "3.4.3"
+description = "A tokenizer for Icelandic text"
+authors = [{ name = "Miðeind ehf.", email = "mideind@mideind.is" }]
+readme = { file = "README.rst", content-type = "text/x-rst" }
+license = { file = "LICENSE.txt" }
+# For classifier list see: https://pypi.org/pypi?%3Aaction=list_classifiers
+classifiers = [
+    "Development Status :: 5 - Production/Stable",
+    "Intended Audience :: Developers",
+    "License :: OSI Approved :: MIT License",
+    "Operating System :: Unix",
+    "Operating System :: POSIX",
+    "Operating System :: Microsoft :: Windows",
+    "Natural Language :: Icelandic",
+    "Programming Language :: Python",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.8",
+    "Programming Language :: Python :: 3.9",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: Implementation :: CPython",
+    "Programming Language :: Python :: Implementation :: PyPy",
+    "Topic :: Software Development :: Libraries :: Python Modules",
+    "Topic :: Utilities",
+    "Topic :: Text Processing :: Linguistic",
+]
+requires-python = ">=3.8"
+
+[project.urls]
+Repository = "https://github.com/mideind/Tokenizer"
+
+[project.optional-dependencies]
+# dev dependencies
+dev = ["pytest"]
+
+[project.scripts]
+# 'tokenize' command line tool
+tokenize = "tokenizer.main:main"
+
+# *** Configuration of tools ***
+
+[tool.setuptools.packages.find]
+where = ["src"]
+
+[tool.setuptools.package-data]
+where = ["src"]
+
+[tool.pytest.ini_options]
+filterwarnings = [
+    # Ignore deprecation warnings in libraries, their problem not ours
+    "ignore::DeprecationWarning",
+]
+
+[tool.ruff]
+line-length = 120
+
+[tool.black]
+line-length = 120
+
+[tool.isort]
+# This forces these imports to placed at the top
+known_future_library = ["__future__", "typing", "typing_extensions"]
+profile = "black"
+line_length = 120
11 changes: 0 additions & 11 deletions release.sh

This file was deleted.

6 changes: 0 additions & 6 deletions setup.cfg

This file was deleted.

105 changes: 0 additions & 105 deletions setup.py

This file was deleted.

10 changes: 6 additions & 4 deletions src/tokenizer/__init__.py
@@ -26,6 +26,8 @@
 """

+import importlib.metadata
+
 from .definitions import (
     TP_LEFT,
     TP_CENTER,
@@ -59,10 +61,11 @@
     TokenStream,
 )
 from .abbrev import Abbreviations, ConfigError
-from .version import __version__

-__author__ = "Miðeind ehf"
-__copyright__ = "(C) 2022 Miðeind ehf."
+__author__ = "Miðeind ehf."
+__copyright__ = "(C) 2023 Miðeind ehf."
+__version__ = importlib.metadata.version("tokenizer")


 __all__ = (
     "__author__",
@@ -99,4 +102,3 @@
     "TP_RIGHT",
     "TP_WORD",
 )
-
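The change above drops the generated ``version.py`` module and instead reads the version from the installed package metadata via ``importlib.metadata``, so pyproject.toml becomes the single source of truth. A hedged sketch of the pattern, with a fallback for when the distribution is not installed (the helper name and fallback string are illustrative, not from the commit):

```python
import importlib.metadata


def get_version(dist_name: str, default: str = "0.0.0+unknown") -> str:
    """Return the installed version of dist_name, or a fallback.

    importlib.metadata.version() reads the metadata written at install
    time, which is why the commit can delete version.py entirely.
    """
    try:
        return importlib.metadata.version(dist_name)
    except importlib.metadata.PackageNotFoundError:
        # Not installed, e.g. running straight from a source checkout
        return default


print(get_version("definitely-not-an-installed-dist-12345"))  # 0.0.0+unknown
```

Note that the bare ``importlib.metadata.version("tokenizer")`` call in the diff raises ``PackageNotFoundError`` if the package is imported without being installed, which the editable install (``pip install -e``) in the workflow avoids.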
18 changes: 4 additions & 14 deletions src/tokenizer/main.py
@@ -71,12 +71,8 @@

 group = parser.add_mutually_exclusive_group()

-group.add_argument(
-    "--csv", help="Output one token per line in CSV format", action="store_true"
-)
-group.add_argument(
-    "--json", help="Output one token per line in JSON format", action="store_true"
-)
+group.add_argument("--csv", help="Output one token per line in CSV format", action="store_true")
+group.add_argument("--json", help="Output one token per line in JSON format", action="store_true")

 parser.add_argument(
     "-s",
@@ -96,10 +92,7 @@
     "-p",
     "--coalesce_percent",
     action="store_true",
-    help=(
-        "Numbers combined into one token with percentage word forms "
-        "(prósent/prósentustig/hundraðshlutar)"
-    ),
+    help=("Numbers combined into one token with percentage word forms " "(prósent/prósentustig/hundraðshlutar)"),
 )

 parser.add_argument(
@@ -134,10 +127,7 @@
     "-c",
     "--convert_numbers",
     action="store_true",
-    help=(
-        "English-style decimal points and thousands separators "
-        "in numbers changed to Icelandic style"
-    ),
+    help=("English-style decimal points and thousands separators " "in numbers changed to Icelandic style"),
 )

 parser.add_argument(
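The main.py hunks only reflow the argparse calls to the new 120-character line length configured for black and ruff in pyproject.toml; behavior is unchanged. A self-contained sketch of the mutually exclusive ``--csv``/``--json`` group (the program name and sample argv are illustrative):

```python
import argparse

parser = argparse.ArgumentParser(prog="tokenize-demo")
group = parser.add_mutually_exclusive_group()
group.add_argument("--csv", help="Output one token per line in CSV format", action="store_true")
group.add_argument("--json", help="Output one token per line in JSON format", action="store_true")

# Only one of the two flags may be given; argparse enforces this itself
args = parser.parse_args(["--json"])
print(args.json, args.csv)  # True False
```

Passing both flags makes ``parse_args`` exit with an error such as ``argument --json: not allowed with argument --csv``, which is the point of putting them in one exclusive group.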