ENH handle pre- vs post-clitics; support python 3.12

jacksonllee · Dec 13, 2023 · 7113f0d · 7113f0d
1 parent af80e3b
commit 7113f0d
Show file tree

Hide file tree

Showing 16 changed files with 391 additions and 560 deletions.
diff --git a/.circleci/config.yml b/.circleci/config.yml
@@ -11,7 +11,7 @@ jobs:
         type: string
     docker:
       # Pick the highest Python 3.x version that this package is known to support
-      - image: cimg/python:3.11
+      - image: cimg/python:3.12
         #auth:
         #  username: $DOCKERHUB_USERNAME
         #  password: $DOCKERHUB_PASSWORD
@@ -112,7 +112,7 @@ workflows:
             - bandit
           matrix:
             parameters:
-              python-version: ["3.8", "3.9", "3.10", "3.11"]
+              python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
       - build-python-win:
           requires:
             - flake8

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -14,6 +14,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### Fixed
 ### Security
 
+## [0.19.0] - 2023-12-12
+
+### Added
+- Added support for Python 3.12.
+- Handled pre-clitics and post-clitics from %mor tiers
+  and honored their distinction in the parsed utterance.
+
 ## [0.18.0] - 2023-03-11
 
 ### Added

diff --git a/README.rst b/README.rst
@@ -13,6 +13,10 @@ Full documentation: https://pylangacq.org
    :target: https://pypi.python.org/pypi/pylangacq
    :alt: Supported Python versions
 
+.. image:: https://img.shields.io/pypi/dm/pylangacq
+   :target: https://pypi.python.org/pypi/pylangacq
+   :alt: PyPI - Downloads
+
 .. image:: https://circleci.com/gh/jacksonllee/pylangacq.svg?style=shield
    :target: https://circleci.com/gh/jacksonllee/pylangacq
    :alt: CircleCI Builds
@@ -42,12 +46,6 @@ To download and install the most recent version::
 Ready for more?
 Check out the `Quickstart <https://pylangacq.org/quickstart.html>`_ page.
 
-Support
--------
-
-If you have found PyLangAcq useful and would like to offer support,
-`buying me a coffee <https://www.buymeacoffee.com/pylangacq>`_ would go a long way!
-
 Links
 -----
 

diff --git a/docs/source/measures.rst b/docs/source/measures.rst
@@ -35,12 +35,12 @@ For the mean lengths of utterance (MLU) in Eve's data from Brown:
      3.840077071290944,
      3.822669104204753,
      3.8814317673378076,
-     4.176287051482059,
+     4.177847113884555,
      4.2631578947368425,
-     3.976890756302521,
+     3.9936974789915967,
      4.457182320441989,
      4.416536661466458,
      4.499446290143965,
-     4.289506953223768,
+     4.288242730720607,
      4.3813169984686064,
      3.3191094619666046]
diff --git a/docs/source/quickstart.rst b/docs/source/quickstart.rst
@@ -89,7 +89,7 @@ Transcriptions and Annotations
 
     >>> words = eve.words()  # list of strings, for all the words across all 20 files
     >>> len(words)  # total word count
-    119799
+    119781
     >>> words[:8]
     ['more', 'cookie', '.', 'you', '0v', 'more', 'cookies', '?']
 
@@ -106,25 +106,25 @@ the method has the optional boolean parameter ``by_files``:
     >>> for words_one_file in words_by_files:
     ...     print(len(words_one_file))
     ...
-    5810
-    5258
-    2493
-    5742
+    5809
+    5252
+    2488
+    5739
     5707
     4338
-    5298
+    5299
     8901
     4454
-    4535
-    4196
-    6193
+    4533
+    4195
+    6195
     4444
-    5202
-    8075
+    5207
+    8073
     7361
     10870
-    8407
-    6903
+    8403
+    6901
     5612
 
 Apart from transcriptions, CHAT data has rich annotations for linguistic
@@ -194,7 +194,7 @@ and :func:`~pylangacq.Reader.word_ngrams`:
     >>> word_freq.most_common(5)
     [('.', 20071),
      ('?', 6358),
-     ('you', 3695),
+     ('you', 3681),
      ('the', 2524),
      ('it', 2363)]
 
@@ -228,13 +228,13 @@ To get the mean length of utterance (MLU), use :func:`~pylangacq.Reader.mlu`:
      3.840077071290944,
      3.822669104204753,
      3.8814317673378076,
-     4.176287051482059,
+     4.177847113884555,
      4.2631578947368425,
-     3.976890756302521,
+     3.9936974789915967,
      4.457182320441989,
      4.416536661466458,
      4.499446290143965,
-     4.289506953223768,
+     4.288242730720607,
      4.3813169984686064,
      3.3191094619666046]
 

diff --git a/docs/source/read.rst b/docs/source/read.rst
@@ -53,15 +53,15 @@ a :func:`~pylangacq.read_chat` call like this typically takes a couple seconds.
 
     >>> brown.info()
     214 files
-    184639 utterances
-    881179 words
+    184631 utterances
+    880748 words
           Utterance Count    Word Count  File Path
     --  -----------------  ------------  ---------------------
-    #1               1737          6441  Brown/Adam/020304.cha
-    #2               1972          7763  Brown/Adam/020318.cha
-    #3               1305          5579  Brown/Adam/020403.cha
+    #1               1737          6437  Brown/Adam/020304.cha
+    #2               1972          7757  Brown/Adam/020318.cha
+    #3               1305          5572  Brown/Adam/020403.cha
     #4               1224          4570  Brown/Adam/020415.cha
-    #5               1344          5480  Brown/Adam/020430.cha
+    #5               1344          5469  Brown/Adam/020430.cha
     ...
     (set `verbose` to True for all the files)
 
@@ -383,7 +383,7 @@ organized by a subdirectory structure.
 :func:`~pylangacq.Reader.filter` allows us to easily create :class:`~pylangacq.Reader` objects
 for individual children without re-downloading data:
 
-.. skip: start if(os.name == "nt", reason="Windows OS sep is backslash instead")
+.. skip: start if(os.getenv("CI") == "true", reason="test got killed at brown = pylangacq.read_chat(url) for python 3.12 on CircleCI?")
 
 .. code-block:: python
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -4,35 +4,25 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "pylangacq"
-version = "0.18.0"
+version = "0.19.0"
 description = "Tools for Language Acquisition Research"
 readme = "README.rst"
 requires-python = ">= 3.8"
 license = { text = "MIT License" }
 authors = [ { name = "Jackson L. Lee", email = "jacksonlunlee@gmail.com" } ]
 keywords = [
-    "computational linguistics",
-    "natural language processing",
-    "NLP",
-    "linguistics",
-    "corpora",
-    "speech",
-    "language",
     "CHILDES",
     "TalkBank",
-    "CHAT",
-    "transcription",
-    "child language",
-    "language acquisition",
-    "language development",
+    "language-acquisition",
+    "language-development",
 ]
 dependencies = [
     'python-dateutil >= 2.0.0',
     'requests >= 2.18.0',
     'tabulate[widechars] >= 0.8.9',
 ]
 classifiers = [
-    "Development Status :: 4 - Beta",
+    "Development Status :: 5 - Production/Stable",
     "Intended Audience :: Developers",
     "Intended Audience :: Education",
     "Intended Audience :: Information Technology",
@@ -41,10 +31,11 @@ classifiers = [
     "Operating System :: OS Independent",
     "Programming Language :: Python :: 3",
     "Programming Language :: Python :: 3 :: Only",
-    "Programming Language :: Python :: 3.7",
     "Programming Language :: Python :: 3.8",
     "Programming Language :: Python :: 3.9",
     "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
     "Topic :: Scientific/Engineering",
     "Topic :: Scientific/Engineering :: Artificial Intelligence",
     "Topic :: Scientific/Engineering :: Human Machine Interfaces",
@@ -62,22 +53,22 @@ Source = "https://github.com/jacksonllee/pylangacq"
 [project.optional-dependencies]
 dev = [
     # Running tests and linters
-    "black == 23.1.0",
-    "flake8 == 6.0.0",
-    "pytest == 7.2.1",
-    "sybil == 4.0.1",
+    "black == 23.11.0",
+    "flake8 == 6.1.0",
+    "pytest == 7.4.3",
+    "sybil == 6.0.2",
 
     # Managing source distributions
-    "build == 0.10.0",
+    "build == 1.0.3",
     "twine == 4.0.2",
 
     # Building the Sphinx docs
-    "furo == 2022.12.7",
+    "furo == 2023.9.10",
     "m2r == 0.3.1",
-    "numpydoc == 1.5.0",
-    "Sphinx == 6.1.3",
-    "sphinx-sitemap == 2.5.0",
-    "sphinx-copybutton == 0.5.1",
+    "numpydoc == 1.6.0",
+    "Sphinx == 7.1.2",
+    "sphinx-sitemap == 2.5.1",
+    "sphinx-copybutton == 0.5.2",
 ]
 
 [tool.setuptools]

diff --git a/scripts/check_talkbank_compatibility.py b/scripts/check_talkbank_compatibility.py
@@ -89,14 +89,15 @@
 
 def _check_compatibility(url: str, successes: int, failures: int) -> Tuple[int, int]:
     try:
-        pylangacq.read_chat(url)
-        successes += 1
+        pylangacq.Reader.from_zip(url, use_cached=False)
     except zipfile.BadZipFile:
         _LOG.warning("Can't reach this dataset: %r", url)
         failures += 1
     except:  # noqa
         _LOG.exception("Can't parse %r -> %r -> %r", db, corpus, dataset)
         failures += 1
+    else:
+        successes += 1
     return successes, failures
 
 

diff --git a/src/pylangacq/_clean_utterance.py b/src/pylangacq/_clean_utterance.py
@@ -91,7 +91,8 @@ def _skip_extract(utterance, regex, replacee) -> str:
         return utterance
 
 
-def _find_paren(s, target, opposite, direction) -> int:
+def _find_paren(utterance, check, target, opposite, direction) -> int:
+    s = utterance[:check]
     if direction == "left":
         indices = range(len(s) - 1, -1, -1)
     elif direction == "right":
@@ -109,14 +110,17 @@ def _find_paren(s, target, opposite, direction) -> int:
         if signal == 0:
             return i
     else:
-        raise ValueError(f"no matching paren: {s}, {target}, {opposite}, {direction}")
+        raise ValueError(
+            "no matching paren: "
+            f"{utterance} | {check} | {target} | {opposite} | {direction}"
+        )
 
 
 def _drop(utterance, test, target_paren, opposite_paren, paren_direction):
     check = utterance.find(test)
     if check != -1:
         paren_i = _find_paren(
-            utterance[:check], target_paren, opposite_paren, paren_direction
+            utterance, check, target_paren, opposite_paren, paren_direction
         )
         utterance = f"{utterance[: paren_i]} {utterance[check + len(test):]}"
         utterance = " ".join(utterance.split())
@@ -237,16 +241,22 @@ def _clean_utterance(utterance: str) -> str:
         "(...)",
         ":",
         ";",
+        ";;",
         "<",
         ">",
         # Drop the following for PhonBank later?
+        "xx",
+        "yy",
         "xxx",
         "yyy",
         "www",
+        "www:",
         "xxx:",
         "xxx;",
+        "xxx;;",
         "xxx→",
         "xxx↑",
+        "xxx@si",
         "yyy:",
         "→",
     }