From 15453a72c43ffc0fe60c14845e239255aa6bb75c Mon Sep 17 00:00:00 2001
From: "Jackson L. Lee"
Date: Wed, 11 Sep 2019 16:10:12 -0500
Subject: [PATCH] Require Python 3.6+ and misc. repo maintenance (#9)

* Update dev dependency versions
* Switch to CircleCI
* Update setup.py for python 3.6+
* Convert README from rst to markdown
* No need for Python 2+3 cross compatible code anymore
* Style conforming to black/white
* Add notes for development and contribution
* Version CONTRIBUTING.md for real
* A more robust .gitignore
* Update changelog
---
 .circleci/config.yml | 54 ++
 .gitignore | 6 +-
 .travis.yml | 23 -
 CHANGELOG.md | 13 +-
 CONTRIBUTING.md | 72 +++
 MANIFEST.in | 4 -
 README.md | 92 +++
 README.rst | 82 ---
 dev-requirements.txt | 13 -
 pylangacq/__init__.py | 4 +-
 pylangacq/_version.py | 2 +-
 pylangacq/chat.py | 916 +++++++++++++++++----------
 pylangacq/compat.py | 22 -
 pylangacq/dependency.py | 59 +-
 pylangacq/measures.py | 958 ++++++++++++++++-------------
 pylangacq/tests/test_chat.py | 373 ++++++-----
 pylangacq/tests/test_dependency.py | 32 +-
 pylangacq/tests/test_util.py | 141 +++--
 pylangacq/tests/test_version.py | 13 +-
 pylangacq/util.py | 249 ++++----
 requirements.txt | 14 +
 setup.cfg | 7 -
 setup.py | 117 ++--
 23 files changed, 1884 insertions(+), 1382 deletions(-)
 create mode 100644 .circleci/config.yml
 delete mode 100644 .travis.yml
 create mode 100644 CONTRIBUTING.md
 delete mode 100644 MANIFEST.in
 create mode 100644 README.md
 delete mode 100644 README.rst
 delete mode 100644 dev-requirements.txt
 delete mode 100644 pylangacq/compat.py
 create mode 100644 requirements.txt

diff --git a/.circleci/config.yml b/.circleci/config.yml
new file mode 100644
index 0000000..f90ded0
--- /dev/null
+++ b/.circleci/config.yml
@@ -0,0 +1,54 @@
+version: 2
+
+workflows:
+  version: 2
+  test:
+    jobs:
+      - build-python-3.6
+      - build-python-3.7
+      - build-python-3.8
+
+jobs:
+  build-python-3.6: &template
+    docker:
+      - image: python:3.6
+    steps:
+      - checkout
+      - run:
+          name: Build source distribution and install package from it
+          working_directory: ~/project/
+          # Test that we can build a source distribution that can correctly
+          # install from a clean slate.
+          # "python setup.py sdist" creates dist/pylangacq-x.y.z.tar.gz
+          command: |
+              pip install --progress-bar off --upgrade pip setuptools
+              python setup.py sdist
+              pip install dist/`ls dist/ | grep .tar.gz`
+      - run:
+          name: Install the full development requirements
+          working_directory: ~/project/
+          command: pip install --progress-bar off -r requirements.txt
+      - run:
+          name: Show installed Python packages
+          command: pip list
+      - run:
+          name: Run linter
+          working_directory: ~/
+          # Avoid being able to do relative imports.
+          # Test code by importing the *installed* library in site-packages.
+          command: flake8 project/setup.py project/pylangacq
+      - run:
+          name: Run python tests
+          working_directory: ~/
+          # Avoid being able to do relative imports.
+          # Test code by importing the *installed* library in site-packages.
+          command: pytest -vv --cov=project/pylangacq project/pylangacq
+  build-python-3.7:
+    <<: *template
+    docker:
+      - image: python:3.7
+  build-python-3.8:
+    <<: *template
+    docker:
+      # TODO: Switch to python:3.8 after Python 3.8 is out in Oct/Nov 2019.
+      - image: python:3.8-rc

diff --git a/.gitignore b/.gitignore
index b0cb402..b80b619 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,9 @@
+*.egg-info/
+.pytest_cache/
+dist/
+
 docs/_sources/*
 
 # Test data
 Brown/*
-brown.zip
\ No newline at end of file
+brown.zip

diff --git a/.travis.yml b/.travis.yml
deleted file mode 100644
index 6c1bc9d..0000000
--- a/.travis.yml
+++ /dev/null
@@ -1,23 +0,0 @@
-language: python
-
-python:
-  - "3.5"
-  - "3.6"
-
-# xenial and sudo workaround currently required for 3.7 on Travis,
-# see: https://github.com/travis-ci/travis-ci/issues/9815
-# Enable 3.7 without globally enabling sudo and dist: xenial for other build jobs
-matrix:
-  include:
-    - python: 3.7
-      dist: xenial
-      sudo: true
-
-install:
-  - pip install --upgrade pip setuptools
-  - pip install -r dev-requirements.txt
-  - pip install --no-deps -e .
-
-script:
-  - flake8 pylangacq
-  - py.test -rxXs -vv --cov pylangacq pylangacq

diff --git a/CHANGELOG.md b/CHANGELOG.md
index b4b8952..d956942 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,19 +8,20 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 
 ## [Unreleased]
 
 ### Added
-* Support Python 3.7; turn on Travis CI builds for this Python version. (#7)
+* Started testing Python 3.7 and 3.8 on continuous integration. (#9)
 * Add time marker support (available at `_SingleReader`),
   originally contributed at #3 by @hellolzc. (#8)
 
 ### Changed
+* Switched from Travis CI to CircleCI for autobuilds. (#9)
+* Switched README from reStructuredText to Markdown. (#9)
-* Remove conversational quotes in utterance processing; updated test CHAT file
+* Removed conversational quotes in utterance processing; updated test CHAT file
   to match the latest CHILDES data. (#7)
 
-### Deprecated
-* Python < 3.5. If used, `DeprecationWarning` is raised during
-  installation. (#7)
-
 ### Removed
+* Dropped support for Python 2.7, 3.4, and 3.5.
+  All code related to Python 2+3 cross compatibility was removed. (#9)
+
 ### Fixed
 
 ### Security

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
new file mode 100644
index 0000000..3ee8518
--- /dev/null
+++ b/CONTRIBUTING.md
@@ -0,0 +1,72 @@
+# Contributing
+
+Thank you for your interest in contributing to the `pylangacq` codebase!
+
+This page assumes that you have already created a fork of the `pylangacq` repo
+under your GitHub account and have the codebase available locally for
+development work. If you have followed
+[these steps](https://github.com/jacksonllee/pylangacq#development),
+then you are all set.
+
+## Working on a Feature or Bug Fix
+
+The development steps below assume that your local Git repo has a remote
+`upstream` link to `jacksonllee/pylangacq`:
+
+```bash
+git remote add upstream https://github.com/jacksonllee/pylangacq.git
+```
+
+After this step (which you only have to do once),
+running `git remote -v` should show that your local Git repo
+has links to both "origin" (pointing to your fork `<your-github-username>/pylangacq`)
+and "upstream" (pointing to `jacksonllee/pylangacq`).
+
+To work on a feature or bug fix, here are the development steps:
+
+1. Before doing any work, check out the master branch and
+   make sure that your local master branch is up-to-date with upstream master:
+
+   ```bash
+   git checkout master
+   git pull upstream master
+   ```
+
+2. Create a new branch. This branch is where you will make commits of your work.
+   (As a best practice, never make commits while on the master branch.
+   Running `git branch` tells you which branch you are on.)
+
+   ```bash
+   git checkout -b new-branch-name
+   ```
+
+3. Make as many commits as needed for your work.
+
+4. When you feel your work is ready for a pull request,
+   push your branch to your fork.
+
+   ```bash
+   git push origin new-branch-name
+   ```
+
+5. Go to your fork `https://github.com/<your-github-username>/pylangacq` and
+   create a pull request from your branch against the `jacksonllee/pylangacq` repo.
+
+## Running Tests
+
+The `pylangacq` repo has continuous integration (CI) turned on,
+with autobuilds running pytest on the test suite
+(in the [`pylangacq/tests/`](pylangacq/tests) directory)
+and flake8 for code style checks.
+If an autobuild at a pending pull request fails because of pytest or flake8
+errors, then the author must fix the errors and push further commits to
+the branch.
+
+If you would like to help avoid wasting free Internet resources
+(every push triggers new CI autobuilds),
+you can run the pytest and flake8 checks locally before pushing commits:
+
+```bash
+flake8 setup.py pylangacq
+pytest -vv --cov=pylangacq pylangacq
+```

diff --git a/MANIFEST.in b/MANIFEST.in
deleted file mode 100644
index b322dd2..0000000
--- a/MANIFEST.in
+++ /dev/null
@@ -1,4 +0,0 @@
-include README.rst
-include CHANGELOG.md
-include LICENSE.txt
-include dev-requirements.txt
\ No newline at end of file

diff --git a/README.md b/README.md
new file mode 100644
index 0000000..79546f6
--- /dev/null
+++ b/README.md
@@ -0,0 +1,92 @@
+# PyLangAcq
+
+[![PyPI version](https://badge.fury.io/py/pylangacq.svg)](https://pypi.org/project/pylangacq)
+[![Supported Python versions](https://img.shields.io/pypi/pyversions/pylangacq.svg)](https://pypi.org/project/pylangacq)
+[![CircleCI](https://circleci.com/gh/jacksonllee/pylangacq/tree/master.svg?style=svg)](https://circleci.com/gh/jacksonllee/pylangacq/tree/master)
+
+PyLangAcq is a Python library for language acquisition research.
+It allows flexible handling of CHILDES data.
+
+Full documentation: http://pylangacq.org/
+
+## Features
+
+- Comprehensive capabilities for handling CHAT transcripts as used in CHILDES
+- Intuitive data structures for flexible data access and all sorts of modeling work
+- Standard developmental measures such as TTR, MLU, and IPSyn readily available
+- More benefits from Python: fast coding, numerous libraries
+  for computational modeling and machine learning
+- Powerful extensions for research with conversational data in general
+
+## Download and install
+
+PyLangAcq is available via `pip`:
+
+```bash
+pip install -U pylangacq
+```
+
+PyLangAcq works with Python 3.6 or above.
+
+## Development
+
+The source code of PyLangAcq is hosted on GitHub at
+https://github.com/jacksonllee/pylangacq,
+where development also happens.
+
+To get the latest changes not yet released through `pip`, or to work on the
+codebase yourself, obtain the source code through GitHub and `git`:
+
+1. Create a fork of the `pylangacq` repo under your GitHub account.
+2. Locally, make sure you are in some sort of virtual environment
+   (venv, virtualenv, conda, etc.).
+3. Download and install the library in "editable" mode,
+   together with the core and dev dependencies, within the virtual environment:
+
+   ```bash
+   git clone https://github.com/<your-github-username>/pylangacq.git
+   cd pylangacq
+   pip install --upgrade pip setuptools
+   pip install -r requirements.txt
+   pip install -e .
+   ```
+
+We keep track of notable changes in
+[CHANGELOG.md](https://github.com/jacksonllee/pylangacq/blob/master/CHANGELOG.md).
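+
+## Usage
+
+A minimal sketch of the API (the CHAT file path below is hypothetical; point
+`read_chat` at any local CHAT files, or at a glob pattern matching them):
+
+```python
+import pylangacq
+
+# Read one or more CHAT files; glob patterns such as "Brown/Eve/*.cha" also work.
+reader = pylangacq.read_chat("Brown/Eve/eve01.cha")
+
+reader.number_of_files()    # number of CHAT files read
+reader.participant_codes()  # participant codes across files, e.g., {"CHI", "MOT"}
+reader.words(participant="CHI")  # the child's word tokens, as a flat list
+reader.MLU()                # mean length of utterance, as a dict keyed by file path
+```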
+ +## Contribution + +For questions, bug reports, and feature requests, +please [file an issue](https://github.com/jacksonllee/pylangacq/issues). + +If you would like to contribute to the `pylangacq` codebase, +please see +[CONTRIBUTING.md](https://github.com/jacksonllee/pylangacq/blob/master/CONTRIBUTING.md). + +## How to Cite + +PyLangAcq is maintained by [Jackson Lee](http://jacksonllee.com/). +If you use PyLangAcq in your research, please cite the following: + +Lee, Jackson L., Ross Burkholder, Gallagher B. Flinn, and Emily R. Coppess. 2016. +[Working with CHAT transcripts in Python](http://jacksonllee.com/papers/lee-etal-2016-pylangacq.pdf). +Technical report [TR-2016-02](http://www.cs.uchicago.edu/research/publications/techreports/TR-2016-02), +Department of Computer Science, University of Chicago. + +```bibtex +@TechReport{lee-et-al-pylangacq:2016, + Title = {Working with CHAT transcripts in Python}, + Author = {Lee, Jackson L. and Burkholder, Ross and Flinn, Gallagher B. and Coppess, Emily R.}, + Institution = {Department of Computer Science, University of Chicago}, + Year = {2016}, + Number = {TR-2016-02}, +} +``` + +## License + +The MIT License; please see [LICENSE.txt](https://github.com/jacksonllee/pylangacq/blob/master/LICENSE.txt). +The test data files included +have a [CC BY-NC-SA 3.0](https://creativecommons.org/licenses/by-nc-sa/3.0/) +license instead; please also see +[`pylangacq/tests/test_data/README.md`](https://github.com/jacksonllee/pylangacq/blob/master/pylangacq/tests/test_data/README.md). diff --git a/README.rst b/README.rst deleted file mode 100644 index fafb05f..0000000 --- a/README.rst +++ /dev/null @@ -1,82 +0,0 @@ -PyLangAcq -========= - -.. image:: https://badge.fury.io/py/pylangacq.svg - :target: https://pypi.python.org/pypi/pylangacq - :alt: PyPI version - -.. image:: https://img.shields.io/pypi/pyversions/pylangacq.svg - :target: https://pypi.python.org/pypi/pylangacq - :alt: Supported Python versions - -.. image:: https://travis-ci.org/pylangacq/pylangacq.svg?branch=master - :target: https://travis-ci.org/pylangacq/pylangacq - :alt: Build - - -PyLangAcq is a Python library for language acquisition research. -It allows flexible handling of the CHILDES data. - -Full documentation: http://pylangacq.org/ - - -Features --------- - -* Comprehensive capabilities of handling CHAT transcripts - as used in CHILDES -* Intuitive data structures for flexible data access and all sorts of modeling work -* Standard developmental measures such as TTR, MLU, and IPSyn readily available -* More benefits from Python: fast coding, numerous libraries for computational - modeling and machine learning -* Powerful extensions for research with conversational data in general - - -Download and install --------------------- - -PyLangAcq is available via `pip`: - -.. code-block:: bash - - $ pip install -U pylangacq - -PyLangAcq works with Python 3.5+. -(Usage with Python 2.7 and 3.4 is deprecated starting from PyLangAcq v0.11.0.) - - -How to cite ------------ - -PyLangAcq is maintained by `Jackson Lee `_. -If you use PyLangAcq in your research, -please cite the following: - -Lee, Jackson L., Ross Burkholder, Gallagher B. Flinn, and Emily R. Coppess. 2016. -`Working with CHAT transcripts in Python `_. -Technical report `TR-2016-02 `_, -Department of Computer Science, University of Chicago. - -.. code-block:: bash - - @TechReport{lee-et-al-pylangacq:2016, - Title = {Working with CHAT transcripts in Python}, - Author = {Lee, Jackson L. and Burkholder, Ross and Flinn, Gallagher B. 
and Coppess, Emily R.}, - Institution = {Department of Computer Science, University of Chicago}, - Year = {2016}, - Number = {TR-2016-02}, - } - - -Change log ----------- - -See `CHANGELOG.md `_. - - -License -------- - -The MIT License. Please see `LICENSE.txt `_. -The test data files included have a `CC BY-NC-SA 3.0 `_ -license instead -- please see ``pylangacq/tests/test_data/README.md``. diff --git a/dev-requirements.txt b/dev-requirements.txt deleted file mode 100644 index 360e4cd..0000000 --- a/dev-requirements.txt +++ /dev/null @@ -1,13 +0,0 @@ -# For tests -flake8==3.7.7 -pytest==4.5.0 -pytest-cov==2.7.1 -requests==2.22.0 -# -# For docs -alabaster==0.7.12 -Sphinx==2.0.1 -numpydoc==0.9.1 -# -# For PyPI releases -twine==1.13.0 \ No newline at end of file diff --git a/pylangacq/__init__.py b/pylangacq/__init__.py index b389c86..28e47cd 100644 --- a/pylangacq/__init__.py +++ b/pylangacq/__init__.py @@ -1,5 +1,5 @@ from pylangacq.chat import read_chat, Reader -from pylangacq._version import __version__ # noqa +from pylangacq._version import __version__ -__all__ = ['read_chat', 'Reader'] +__all__ = ["__version__", "read_chat", "Reader"] diff --git a/pylangacq/_version.py b/pylangacq/_version.py index d533f54..8806bd5 100644 --- a/pylangacq/_version.py +++ b/pylangacq/_version.py @@ -1,2 +1,2 @@ # this script is executed by setup.py -__version__ = '0.10.0' +__version__ = "0.10.0" diff --git a/pylangacq/chat.py b/pylangacq/chat.py index 73cc673..9e93362 100644 --- a/pylangacq/chat.py +++ b/pylangacq/chat.py @@ -1,5 +1,3 @@ -# -*- coding: utf-8 -*- - """Interfacing with CHAT data files.""" import sys @@ -15,11 +13,16 @@ from functools import wraps from pylangacq.measures import get_MLUm, get_MLUw, get_TTR, get_IPSyn -from pylangacq.util import (ENCODING, CLITIC, - get_participant_code, convert_date_to_tuple, - clean_utterance, clean_word, get_lemma_from_mor, - get_time_marker) -from pylangacq.compat import open, unicode_, OPEN_MODE, FileNotFoundError +from pylangacq.util import ( + ENCODING, + CLITIC, + get_participant_code, + convert_date_to_tuple, + clean_utterance, + clean_word, + get_lemma_from_mor, + get_time_marker, +) _TEMP_DIR = tempfile.mkdtemp() @@ -51,23 +54,23 @@ def read_chat(*filenames, **kwargs): def params_in_docstring(*params): - docstring = '' - if 'participant' in params: + docstring = "" + if "participant" in params: docstring += """ participant : str or iterable of str, optional Participants of interest. If unspecified or ``None``, all participants are included.""" - if 'exclude' in params: + if "exclude" in params: docstring += """ exclude : str or iterable of str, optional Participants to exclude. 
If unspecified or ``None``, no participants are excluded.""" - if 'by_files' in params: + if "by_files" in params: docstring += """ by_files : bool, optional If ``True``, return dict(absolute-path filename: X for that file) instead of X for all files altogether.""" - if 'keep_case' in params: + if "keep_case" in params: docstring += """ keep_case : bool, optional If ``True`` (the default), case distinctions are kept, e.g., @@ -75,14 +78,17 @@ def params_in_docstring(*params): If ``False``, all word tokens are forced to be in lowercase.""" def real_decorator(func): - returns_header = '\n\n Returns\n -------' - func.__doc__ = func.__doc__.replace(returns_header, - docstring + returns_header) + returns_header = "\n\n Returns\n -------" + func.__doc__ = func.__doc__.replace( + returns_header, docstring + returns_header + ) @wraps(func) def wrapper(*args, **kwargs): return func(*args, **kwargs) + return wrapper + return real_decorator @@ -103,8 +109,9 @@ class Reader(object): Only the keyword ``encoding`` is recognized, which defaults to 'utf8'. (New in version 0.9) """ + def __init__(self, *filenames, **kwargs): - self.encoding = kwargs.get('encoding', ENCODING) + self.encoding = kwargs.get("encoding", ENCODING) self._input_filenames = filenames self._reset_reader(*self._input_filenames) @@ -125,7 +132,7 @@ def from_chat_str(cls, chat_str, encoding=ENCODING): Reader """ file_path = os.path.join(_TEMP_DIR, str(uuid.uuid4())) - with open(file_path, mode='w', encoding=encoding) as f: + with open(file_path, mode="w", encoding=encoding) as f: f.write(chat_str) return cls(file_path, encoding=encoding) @@ -162,42 +169,47 @@ def from_chat_files(cls, *filenames, **kwargs): @staticmethod def _get_abs_filenames(*filenames): """Return the set of absolute-path filenames based on filenames.""" - if sys.platform.startswith('win'): + if sys.platform.startswith("win"): windows = True # pragma: no cover else: windows = False filenames_set = set() for filename in filenames: - if not isinstance(filename, (str, unicode_)): - raise ValueError('{} is not str'.format(repr(filename))) + if not isinstance(filename, str): + raise ValueError("{} is not str".format(repr(filename))) if windows: - filename = filename.replace('/', os.sep) # pragma: no cover + filename = filename.replace("/", os.sep) # pragma: no cover else: - filename = filename.replace('\\', os.sep) + filename = filename.replace("\\", os.sep) abs_fullpath = os.path.abspath(filename) abs_dir = os.path.dirname(abs_fullpath) - glob_match_pattern = re.compile(r'.*[\*\?\[\]].*') + glob_match_pattern = re.compile(r".*[\*\?\[\]].*") while glob_match_pattern.search(abs_dir): # pragma: no cover abs_dir = os.path.dirname(abs_dir) if not os.path.isdir(abs_dir): # pragma: no cover - msg = (u'{} is not a directory. Filename {} is likely invalid.' - .format(abs_dir, filename)) + msg = ( + f"{abs_dir} is not a directory. " + f"Filename {filename} is likely invalid." 
+ ) raise ValueError(msg) - candidate_filenames = [os.path.join(dir_, fn) - for dir_, _, fns in os.walk(abs_dir) - for fn in fns] + candidate_filenames = [ + os.path.join(dir_, fn) + for dir_, _, fns in os.walk(abs_dir) + for fn in fns + ] - filenames_set.update(fnmatch.filter(candidate_filenames, - abs_fullpath)) + filenames_set.update( + fnmatch.filter(candidate_filenames, abs_fullpath) + ) return filenames_set def _reset_reader(self, *filenames, **kwargs): - check = kwargs.get('check', True) + check = kwargs.get("check", True) filenames_set = set() if not check: @@ -211,8 +223,9 @@ def _reset_reader(self, *filenames, **kwargs): self._fname_to_reader = {} for fn in self._filenames: # TODO rewrite what _SingleReader takes as args - self._fname_to_reader[fn] = _SingleReader(fn, - encoding=self.encoding) + self._fname_to_reader[fn] = _SingleReader( + fn, encoding=self.encoding + ) def __len__(self): """Return the number of files. @@ -240,8 +253,12 @@ def filenames(self, sorted_by_age=False): return self._filenames else: # sort by filename first (so filenames with same age are sorted) - return [fn for fn, _ in - sorted(sorted(self.age().items()), key=lambda x: x[1])] + return [ + fn + for fn, _ in sorted( + sorted(self.age().items()), key=lambda x: x[1] + ) + ] def number_of_files(self): """Return the number of files. @@ -252,9 +269,10 @@ def number_of_files(self): """ return len(self) - @params_in_docstring('participant', 'exclude', 'by_files') - def number_of_utterances(self, participant=None, exclude=None, - by_files=False): + @params_in_docstring("participant", "exclude", "by_files") + def number_of_utterances( + self, participant=None, exclude=None, by_files=False + ): """Return the number of utterances for *participant* in all files. Parameters @@ -265,13 +283,19 @@ def number_of_utterances(self, participant=None, exclude=None, int or dict(str: int) """ if by_files: - return {fn: self._fname_to_reader[fn].number_of_utterances( - participant=participant, exclude=exclude) - for fn in self._filenames} + return { + fn: self._fname_to_reader[fn].number_of_utterances( + participant=participant, exclude=exclude + ) + for fn in self._filenames + } else: - return sum(self._fname_to_reader[fn].number_of_utterances( - participant=participant, exclude=exclude) - for fn in self._filenames) + return sum( + self._fname_to_reader[fn].number_of_utterances( + participant=participant, exclude=exclude + ) + for fn in self._filenames + ) def headers(self): """Return a dict mapping a file path to the headers of that file. @@ -280,8 +304,9 @@ def headers(self): ------- dict(str: dict) """ - return {fn: self._fname_to_reader[fn].headers() - for fn in self._filenames} + return { + fn: self._fname_to_reader[fn].headers() for fn in self._filenames + } def index_to_tiers(self): """Return a dict mapping a file path to the file's index_to_tiers dict. @@ -290,8 +315,10 @@ def index_to_tiers(self): ------- dict(str: dict) """ - return {fn: self._fname_to_reader[fn].index_to_tiers() - for fn in self._filenames} + return { + fn: self._fname_to_reader[fn].index_to_tiers() + for fn in self._filenames + } def participants(self): """Return a dict mapping a file path to the file's participant info. 
@@ -300,10 +327,12 @@ def participants(self): ------- dict(str: dict) """ - return {fn: self._fname_to_reader[fn].participants() - for fn in self._filenames} + return { + fn: self._fname_to_reader[fn].participants() + for fn in self._filenames + } - @params_in_docstring('by_files') + @params_in_docstring("by_files") def participant_codes(self, by_files=False): """Return the participant codes (e.g., ``{'CHI', 'MOT'}``). @@ -315,8 +344,10 @@ def participant_codes(self, by_files=False): set(str) or dict(str: set(str)) """ if by_files: - return {fn: self._fname_to_reader[fn].participant_codes() - for fn in self._filenames} + return { + fn: self._fname_to_reader[fn].participant_codes() + for fn in self._filenames + } else: output_set = set() for fn in self._filenames: @@ -331,8 +362,9 @@ def languages(self): ------- dict(str: list(str)) """ - return {fn: self._fname_to_reader[fn].languages() - for fn in self._filenames} + return { + fn: self._fname_to_reader[fn].languages() for fn in self._filenames + } def dates_of_recording(self): """Return a map from a file path to the date of recording. @@ -343,8 +375,10 @@ def dates_of_recording(self): ------- dict(str: list(tuple(int, int, int))) """ - return {fn: self._fname_to_reader[fn].dates_of_recording() - for fn in self._filenames} + return { + fn: self._fname_to_reader[fn].dates_of_recording() + for fn in self._filenames + } def date_of_birth(self): """Return a map from a file path to the date of birth. @@ -353,10 +387,12 @@ def date_of_birth(self): ------- dict(str: dict(str: tuple(int, int, int))) """ - return {fn: self._fname_to_reader[fn].date_of_birth() - for fn in self._filenames} + return { + fn: self._fname_to_reader[fn].date_of_birth() + for fn in self._filenames + } - def age(self, participant='CHI', months=False): + def age(self, participant="CHI", months=False): """Return a map from a file path to the *participant*'s age. The age is in the form of (years, months, days). @@ -372,8 +408,12 @@ def age(self, participant='CHI', months=False): ------- dict(str: tuple(int, int, int)) or dict(str: float) """ - return {fn: self._fname_to_reader[fn].age( - participant=participant, months=months) for fn in self._filenames} + return { + fn: self._fname_to_reader[fn].age( + participant=participant, months=months + ) + for fn in self._filenames + } def abspath(self, basename): """Return the absolute path of ``basename``. @@ -392,11 +432,12 @@ def abspath(self, basename): if os.path.basename(file_path) == basename: return file_path else: - raise ValueError('No such file.') + raise ValueError("No such file.") - @params_in_docstring('participant', 'exclude', 'by_files') - def utterances(self, participant=None, exclude=None, clean=True, - by_files=False): + @params_in_docstring("participant", "exclude", "by_files") + def utterances( + self, participant=None, exclude=None, clean=True, by_files=False + ): """Return a list of (*participant*, utterance) pairs from all files. 
Parameters @@ -409,20 +450,26 @@ def utterances(self, participant=None, exclude=None, clean=True, list(str) or dict(str: list(str)) """ if by_files: - return {fn: self._fname_to_reader[fn].utterances( - participant=participant, exclude=exclude, clean=clean) - for fn in self._filenames} + return { + fn: self._fname_to_reader[fn].utterances( + participant=participant, exclude=exclude, clean=clean + ) + for fn in self._filenames + } else: return list( chain.from_iterable( self._fname_to_reader[fn].utterances( - participant=participant, exclude=exclude, clean=clean) - for fn in sorted(self._filenames)) + participant=participant, exclude=exclude, clean=clean + ) + for fn in sorted(self._filenames) + ) ) - @params_in_docstring('participant', 'exclude', 'keep_case', 'by_files') - def word_frequency(self, participant=None, exclude=None, keep_case=True, - by_files=False): + @params_in_docstring("participant", "exclude", "keep_case", "by_files") + def word_frequency( + self, participant=None, exclude=None, keep_case=True, by_files=False + ): """Return a word frequency counter for *participant* in all files. Parameters @@ -433,19 +480,27 @@ def word_frequency(self, participant=None, exclude=None, keep_case=True, Counter, or dict(str: Counter) """ if by_files: - return {fn: self._fname_to_reader[fn].word_frequency( - participant=participant, exclude=exclude, keep_case=keep_case) - for fn in self._filenames} + return { + fn: self._fname_to_reader[fn].word_frequency( + participant=participant, + exclude=exclude, + keep_case=keep_case, + ) + for fn in self._filenames + } else: output_counter = Counter() for fn in self._filenames: output_counter.update( self._fname_to_reader[fn].word_frequency( - participant=participant, exclude=exclude, - keep_case=keep_case)) + participant=participant, + exclude=exclude, + keep_case=keep_case, + ) + ) return output_counter - @params_in_docstring('participant', 'exclude', 'by_files') + @params_in_docstring("participant", "exclude", "by_files") def words(self, participant=None, exclude=None, by_files=False): """Return a list of words by *participant* in all files. @@ -457,18 +512,23 @@ def words(self, participant=None, exclude=None, by_files=False): list(str) or dict(str: list(str)) """ if by_files: - return {fn: self._fname_to_reader[fn].words( - participant=participant, exclude=exclude) - for fn in self._filenames} + return { + fn: self._fname_to_reader[fn].words( + participant=participant, exclude=exclude + ) + for fn in self._filenames + } else: return list( chain.from_iterable( self._fname_to_reader[fn].words( - participant=participant, exclude=exclude) - for fn in sorted(self._filenames)) + participant=participant, exclude=exclude + ) + for fn in sorted(self._filenames) + ) ) - @params_in_docstring('participant', 'exclude', 'by_files') + @params_in_docstring("participant", "exclude", "by_files") def tagged_words(self, participant=None, exclude=None, by_files=False): """Return a list of tagged words by *participant* in all files. 
@@ -480,18 +540,23 @@ def tagged_words(self, participant=None, exclude=None, by_files=False): list(tuple) or dict(str: list(tuple)) """ if by_files: - return {fn: self._fname_to_reader[fn].tagged_words( - participant=participant, exclude=exclude) - for fn in self._filenames} + return { + fn: self._fname_to_reader[fn].tagged_words( + participant=participant, exclude=exclude + ) + for fn in self._filenames + } else: return list( chain.from_iterable( self._fname_to_reader[fn].tagged_words( - participant=participant, exclude=exclude) - for fn in sorted(self._filenames)) + participant=participant, exclude=exclude + ) + for fn in sorted(self._filenames) + ) ) - @params_in_docstring('participant', 'exclude', 'by_files') + @params_in_docstring("participant", "exclude", "by_files") def sents(self, participant=None, exclude=None, by_files=False): """Return a list of sents by *participant* in all files. @@ -503,18 +568,23 @@ def sents(self, participant=None, exclude=None, by_files=False): list(list(str)) or dict(str: list(list(str))) """ if by_files: - return {fn: self._fname_to_reader[fn].sents( - participant=participant, exclude=exclude) - for fn in self._filenames} + return { + fn: self._fname_to_reader[fn].sents( + participant=participant, exclude=exclude + ) + for fn in self._filenames + } else: return list( chain.from_iterable( self._fname_to_reader[fn].sents( - participant=participant, exclude=exclude) - for fn in sorted(self._filenames)) + participant=participant, exclude=exclude + ) + for fn in sorted(self._filenames) + ) ) - @params_in_docstring('participant', 'exclude', 'by_files') + @params_in_docstring("participant", "exclude", "by_files") def tagged_sents(self, participant=None, exclude=None, by_files=False): """Return a list of tagged sents by *participant* in all files. @@ -526,20 +596,26 @@ def tagged_sents(self, participant=None, exclude=None, by_files=False): list(list(tuple)) or dict(str: list(list(tuple))) """ if by_files: - return {fn: self._fname_to_reader[fn].tagged_sents( - participant=participant, exclude=exclude) - for fn in self._filenames} + return { + fn: self._fname_to_reader[fn].tagged_sents( + participant=participant, exclude=exclude + ) + for fn in self._filenames + } else: return list( chain.from_iterable( self._fname_to_reader[fn].tagged_sents( - participant=participant, exclude=exclude) - for fn in sorted(self._filenames)) + participant=participant, exclude=exclude + ) + for fn in sorted(self._filenames) + ) ) - @params_in_docstring('participant', 'exclude', 'by_files') - def part_of_speech_tags(self, participant=None, exclude=None, - by_files=False): + @params_in_docstring("participant", "exclude", "by_files") + def part_of_speech_tags( + self, participant=None, exclude=None, by_files=False + ): """Return the part-of-speech tags in the data for *participant*. 
Parameters @@ -550,14 +626,21 @@ def part_of_speech_tags(self, participant=None, exclude=None, set or dict(str: set) """ if by_files: - return {fn: self._fname_to_reader[fn].part_of_speech_tags( - participant=participant, exclude=exclude) - for fn in self._filenames} + return { + fn: self._fname_to_reader[fn].part_of_speech_tags( + participant=participant, exclude=exclude + ) + for fn in self._filenames + } else: - return set().union(*( - self._fname_to_reader[fn].part_of_speech_tags( - participant=participant, exclude=exclude) - for fn in self._filenames)) + return set().union( + *( + self._fname_to_reader[fn].part_of_speech_tags( + participant=participant, exclude=exclude + ) + for fn in self._filenames + ) + ) def update(self, reader): """Combine the current CHAT Reader instance with ``reader``. @@ -569,7 +652,7 @@ def update(self, reader): if type(reader) is Reader: add_filenames = reader.filenames() else: - raise ValueError('invalid reader') + raise ValueError("invalid reader") new_filenames = add_filenames | self.filenames() self._reset_reader(*tuple(new_filenames), check=False) @@ -584,7 +667,7 @@ def add(self, *filenames): """ add_filenames = self._get_abs_filenames(*filenames) if not add_filenames: - raise ValueError('No files to add!') + raise ValueError("No files to add!") new_filenames = self.filenames() | add_filenames self._reset_reader(*tuple(new_filenames), check=False) @@ -598,12 +681,12 @@ def remove(self, *filenames): """ remove_filenames = self._get_abs_filenames(*filenames) if not remove_filenames: - raise ValueError('No files to remove!') + raise ValueError("No files to remove!") new_filenames = set(self.filenames()) for remove_filename in remove_filenames: if remove_filename not in self.filenames(): - raise ValueError('filename not found') + raise ValueError("filename not found") else: new_filenames.remove(remove_filename) @@ -613,9 +696,10 @@ def clear(self): """Clear everything and reset as an empty Reader instance.""" self._reset_reader() - @params_in_docstring('participant', 'exclude', 'keep_case', 'by_files') - def word_ngrams(self, n, participant=None, exclude=None, keep_case=True, - by_files=False): + @params_in_docstring("participant", "exclude", "keep_case", "by_files") + def word_ngrams( + self, n, participant=None, exclude=None, keep_case=True, by_files=False + ): """Return a word ``n``-gram counter by ``participant`` in all files. Returns @@ -623,20 +707,29 @@ def word_ngrams(self, n, participant=None, exclude=None, keep_case=True, Counter, or dict(str: Counter) """ if by_files: - return {fn: self._fname_to_reader[fn].word_ngrams( - n, participant=participant, exclude=exclude, - keep_case=keep_case) - for fn in self._filenames} + return { + fn: self._fname_to_reader[fn].word_ngrams( + n, + participant=participant, + exclude=exclude, + keep_case=keep_case, + ) + for fn in self._filenames + } else: output_counter = Counter() for fn in self._filenames: output_counter.update( self._fname_to_reader[fn].word_ngrams( - n, participant=participant, exclude=exclude, - keep_case=keep_case)) + n, + participant=participant, + exclude=exclude, + keep_case=keep_case, + ) + ) return output_counter - def MLU(self, participant='CHI'): + def MLU(self, participant="CHI"): """Return a map from a file path to the file's MLU by morphemes. MLU = mean length of utterance. This method is identical to ``MLUm``. 
@@ -650,10 +743,12 @@ def MLU(self, participant='CHI'): ------- dict(str: float) """ - return {fn: self._fname_to_reader[fn].MLU( - participant=participant) for fn in self._filenames} + return { + fn: self._fname_to_reader[fn].MLU(participant=participant) + for fn in self._filenames + } - def MLUm(self, participant='CHI'): + def MLUm(self, participant="CHI"): """Return a map from a file path to the file's MLU by morphemes. MLU = mean length of utterance. This method is identical to ``MLUm``. @@ -667,10 +762,12 @@ def MLUm(self, participant='CHI'): ------- dict(str: float) """ - return {fn: self._fname_to_reader[fn].MLUm( - participant=participant) for fn in self._filenames} + return { + fn: self._fname_to_reader[fn].MLUm(participant=participant) + for fn in self._filenames + } - def MLUw(self, participant='CHI'): + def MLUw(self, participant="CHI"): """Return a map from a file path to the file's MLU by words. MLU = mean length of utterance. @@ -684,10 +781,12 @@ def MLUw(self, participant='CHI'): ------- dict(str: float) """ - return {fn: self._fname_to_reader[fn].MLUw( - participant=participant) for fn in self._filenames} + return { + fn: self._fname_to_reader[fn].MLUw(participant=participant) + for fn in self._filenames + } - def TTR(self, participant='CHI'): + def TTR(self, participant="CHI"): """Return a map from a file path to the file's TTR. TTR = type-token ratio @@ -701,10 +800,12 @@ def TTR(self, participant='CHI'): ------- dict(str: float) """ - return {fn: self._fname_to_reader[fn].TTR( - participant=participant) for fn in self._filenames} + return { + fn: self._fname_to_reader[fn].TTR(participant=participant) + for fn in self._filenames + } - def IPSyn(self, participant='CHI'): + def IPSyn(self, participant="CHI"): """Return a map from a file path to the file's IPSyn. IPSyn = index of productive syntax @@ -718,14 +819,23 @@ def IPSyn(self, participant='CHI'): ------- dict(str: int) """ - return {fn: self._fname_to_reader[fn].IPSyn( - participant=participant) for fn in self._filenames} - - @params_in_docstring('participant', 'exclude', 'by_files') - def search(self, search_item, participant=None, exclude=None, - match_entire_word=True, lemma=False, - output_tagged=True, output_sents=True, - by_files=False): + return { + fn: self._fname_to_reader[fn].IPSyn(participant=participant) + for fn in self._filenames + } + + @params_in_docstring("participant", "exclude", "by_files") + def search( + self, + search_item, + participant=None, + exclude=None, + match_entire_word=True, + lemma=False, + output_tagged=True, + output_sents=True, + by_files=False, + ): """Return a list of elements containing *search_item* by *participant*. 
Parameters @@ -750,23 +860,44 @@ def search(self, search_item, participant=None, exclude=None, list or dict(str: list) """ if by_files: - return {fn: self._fname_to_reader[fn].search( - search_item, participant=participant, exclude=exclude, - match_entire_word=match_entire_word, lemma=lemma, - output_tagged=output_tagged, output_sents=output_sents) - for fn in self._filenames} + return { + fn: self._fname_to_reader[fn].search( + search_item, + participant=participant, + exclude=exclude, + match_entire_word=match_entire_word, + lemma=lemma, + output_tagged=output_tagged, + output_sents=output_sents, + ) + for fn in self._filenames + } else: output_list = [] for fn in self.filenames(sorted_by_age=True): - output_list.extend(self._fname_to_reader[fn].search( - search_item, participant=participant, exclude=exclude, - match_entire_word=match_entire_word, lemma=lemma, - output_tagged=output_tagged, output_sents=output_sents)) + output_list.extend( + self._fname_to_reader[fn].search( + search_item, + participant=participant, + exclude=exclude, + match_entire_word=match_entire_word, + lemma=lemma, + output_tagged=output_tagged, + output_sents=output_sents, + ) + ) return output_list - @params_in_docstring('participant', 'exclude', 'by_files') - def concordance(self, search_item, participant=None, exclude=None, - match_entire_word=True, lemma=False, by_files=False): + @params_in_docstring("participant", "exclude", "by_files") + def concordance( + self, + search_item, + participant=None, + exclude=None, + match_entire_word=True, + lemma=False, + by_files=False, + ): """Return a list of utterances with *search_item* for *participant*. All strings are aligned for *search_item* by space @@ -787,16 +918,28 @@ def concordance(self, search_item, participant=None, exclude=None, list, or dict(str: list) """ if by_files: - return {fn: self._fname_to_reader[fn].concordance( - search_item, participant=participant, exclude=exclude, - match_entire_word=match_entire_word, lemma=lemma) - for fn in self._filenames} + return { + fn: self._fname_to_reader[fn].concordance( + search_item, + participant=participant, + exclude=exclude, + match_entire_word=match_entire_word, + lemma=lemma, + ) + for fn in self._filenames + } else: output_list = [] for fn in self.filenames(sorted_by_age=True): - output_list.extend(self._fname_to_reader[fn].concordance( - search_item, participant=participant, exclude=exclude, - match_entire_word=match_entire_word, lemma=lemma)) + output_list.extend( + self._fname_to_reader[fn].concordance( + search_item, + participant=participant, + exclude=exclude, + match_entire_word=match_entire_word, + lemma=lemma, + ) + ) return output_list @@ -808,8 +951,10 @@ def __init__(self, filename=None, str_=None, encoding=ENCODING): self.encoding = encoding if (filename and str_) or (filename is None and str_ is None): - msg = ('_SingleReader is initialized by either one CHAT file or ' - 'one CHAT str (but not both)') + msg = ( + "_SingleReader is initialized by either one CHAT file or " + "one CHAT str (but not both)" + ) raise ValueError(msg) self._filename = os.path.abspath(filename) if filename else None @@ -828,11 +973,21 @@ def __init__(self, filename=None, str_=None, encoding=ENCODING): self._all_tagged_sents = self._create_all_tagged_sents() # for MLUw() and TTR() - self.words_to_ignore = {'', '!', '+...', '.', ',', '?', '‡', - '„', '0', CLITIC} + self.words_to_ignore = { + "", + "!", + "+...", + ".", + ",", + "?", + "‡", + "„", + "0", + CLITIC, + } # for MLUm() - self.pos_to_ignore = {'', '!', '+...', 
'0', '?', 'BEG'} + self.pos_to_ignore = {"", "!", "+...", "0", "?", "BEG"} def __len__(self): return len(self._index_to_tiers) @@ -845,16 +1000,17 @@ def filename(self): def _get_file_object(self): if self._filename: - return open(self._filename, mode=OPEN_MODE, encoding=self.encoding) + return open(self._filename, mode="r", encoding=self.encoding) else: - return io.TextIOWrapper(io.BytesIO(self._str.encode()), - encoding=self.encoding) + return io.TextIOWrapper( + io.BytesIO(self._str.encode()), encoding=self.encoding + ) def cha_lines(self): """A generator of lines in the CHAT file, with the tab-character line continuations undone. """ - previous_line = '' + previous_line = "" for line in self._get_file_object(): previous_line = previous_line.strip() @@ -863,13 +1019,15 @@ def cha_lines(self): if not current_line: continue - if current_line.startswith('%xpho:') or \ - current_line.startswith('%xmod:'): - current_line = current_line.replace('%x', '%', 1) + if current_line.startswith("%xpho:") or current_line.startswith( + "%xmod:" + ): + current_line = current_line.replace("%x", "%", 1) - if previous_line and current_line.startswith('\t'): - previous_line = u'{} {}'.format( - previous_line, current_line.strip()) # strip \t + if previous_line and current_line.startswith("\t"): + previous_line = u"{} {}".format( + previous_line, current_line.strip() + ) # strip \t elif previous_line: yield previous_line previous_line = current_line @@ -882,7 +1040,7 @@ def _tier_markers(self): result = set() for tiermarkers_to_tiers in self._index_to_tiers.values(): for tier_marker in tiermarkers_to_tiers.keys(): - if tier_marker.startswith('%'): + if tier_marker.startswith("%"): result.add(tier_marker) return result @@ -912,27 +1070,28 @@ def _get_index_to_tiers(self): utterance = None for line in self.cha_lines(): - if line.startswith('@'): + if line.startswith("@"): continue line_split = line.split() - if line.startswith('*'): + if line.startswith("*"): index_ += 1 - participant_code = line_split[0].lstrip('*').rstrip(':') - utterance = ' '.join(line_split[1:]) + participant_code = line_split[0].lstrip("*").rstrip(":") + utterance = " ".join(line_split[1:]) result_with_collapses[index_] = {participant_code: utterance} - elif utterance and line.startswith('%'): - tier_marker = line_split[0].rstrip(':') - result_with_collapses[index_][tier_marker] = \ - ' '.join(line_split[1:]) + elif utterance and line.startswith("%"): + tier_marker = line_split[0].rstrip(":") + result_with_collapses[index_][tier_marker] = " ".join( + line_split[1:] + ) # handle collapses such as [x 4] result_without_collapses = {} new_index = -1 # utterance index (1st utterance is index 0) - collapse_pattern = re.compile(r'\[x \d+?\]') # e.g., "[x ]" - number_regex = re.compile(r'\d+') + collapse_pattern = re.compile(r"\[x \d+?\]") # e.g., "[x ]" + number_regex = re.compile(r"\d+") for old_index in range(len(result_with_collapses)): tier_dict = result_with_collapses[old_index] @@ -987,34 +1146,36 @@ def _get_headers(self): for line in self.cha_lines(): - if line.startswith('@Begin') or line.startswith('@End'): + if line.startswith("@Begin") or line.startswith("@End"): continue - if not line.startswith('@'): + if not line.startswith("@"): continue # find head, e.g., "Languages", "Participants", "ID" etc - head, _, line = line.partition('\t') + head, _, line = line.partition("\t") line = line.strip() - head = head.lstrip('@') # remove beginning "@" - head = head.rstrip(':') # remove ending ":", if any + head = head.lstrip("@") # remove 
beginning "@" + head = head.rstrip(":") # remove ending ":", if any - if head == 'Participants': - headname_to_entry['Participants'] = {} + if head == "Participants": + headname_to_entry["Participants"] = {} - participants = line.split(',') + participants = line.split(",") for participant in participants: participant = participant.strip() - code, _, participant_label = participant.partition(' ') - participant_name, _, participant_role = \ - participant_label.partition(' ') + code, _, participant_label = participant.partition(" ") + participant_name, _, participant_role = ( + participant_label.partition(" ") + ) # code = participant code, e.g. CHI, MOT - headname_to_entry['Participants'][code] = \ - {'participant_name': participant_name} + headname_to_entry["Participants"][code] = { + "participant_name": participant_name + } - elif head == 'ID': - participant_info = line.split('|')[: -1] + elif head == "ID": + participant_info = line.split("|")[:-1] # final empty str removed code = participant_info[2] @@ -1023,18 +1184,27 @@ def _get_headers(self): # education, custom del participant_info[2] # remove code info (3rd in list) - participant_info_heads = ['language', 'corpus', 'age', 'sex', - 'group', 'SES', 'participant_role', - 'education', 'custom'] - head_to_info = dict(zip(participant_info_heads, - participant_info)) - - headname_to_entry['Participants'][code].update(head_to_info) - - elif head == 'Date': - if 'Date' not in headname_to_entry: - headname_to_entry['Date'] = [] - headname_to_entry['Date'].append(line) + participant_info_heads = [ + "language", + "corpus", + "age", + "sex", + "group", + "SES", + "participant_role", + "education", + "custom", + ] + head_to_info = dict( + zip(participant_info_heads, participant_info) + ) + + headname_to_entry["Participants"][code].update(head_to_info) + + elif head == "Date": + if "Date" not in headname_to_entry: + headname_to_entry["Date"] = [] + headname_to_entry["Date"].append(line) else: headname_to_entry[head] = line @@ -1091,7 +1261,7 @@ def participants(self): 'sex': ''}} """ try: - return self._headers['Participants'] + return self._headers["Participants"] except KeyError: return {} @@ -1100,7 +1270,7 @@ def participant_codes(self): Return the set of participant codes (e.g., `{'CHI', 'MOT', 'FAT'}`). """ try: - return set(self._headers['Participants'].keys()) + return set(self._headers["Participants"].keys()) except KeyError: return set() @@ -1112,11 +1282,11 @@ def languages(self): languages_list = [] try: - languages_line = self._headers['Languages'] + languages_line = self._headers["Languages"] except KeyError: pass else: - for language in languages_line.split(','): + for language in languages_line.split(","): language = language.strip() if language: languages_list.append(language) @@ -1131,7 +1301,7 @@ def dates_of_recording(self): :rtype: list(tuple(int, int, int)) """ try: - dates = self._headers['Date'] + dates = self._headers["Date"] except KeyError: return None @@ -1150,7 +1320,7 @@ def date_of_birth(self): participant_to_date = {} for header in header_keys: - if not header.startswith('Birth of'): + if not header.startswith("Birth of"): continue # e.g., header is 'Birth of CHI', participant is 'CHI' @@ -1165,7 +1335,7 @@ def date_of_birth(self): return participant_to_date - def age(self, participant='CHI', months=False): + def age(self, participant="CHI", months=False): """ Return the age of *participant* as a tuple or a float. 
@@ -1181,10 +1351,10 @@ def age(self, participant='CHI', months=False): :rtype: tuple or float """ try: - age_ = self._headers['Participants'][participant]['age'] + age_ = self._headers["Participants"][participant]["age"] - year_str, _, month_day = age_.partition(';') - month_str, _, day_str = month_day.partition('.') + year_str, _, month_day = age_.partition(";") + month_str, _, day_str = month_day.partition(".") year_int = int(year_str) if year_str.isdigit() else 0 month_int = int(month_str) if month_str.isdigit() else 0 @@ -1197,8 +1367,9 @@ def age(self, participant='CHI', months=False): except (KeyError, IndexError, ValueError): return None - def utterances(self, participant=None, exclude=None, clean=True, - time_marker=False): + def utterances( + self, participant=None, exclude=None, clean=True, time_marker=False + ): """ Return a list of the utterances by *participant* as (*participant*, *utterance*) pairs. @@ -1238,15 +1409,17 @@ def utterances(self, participant=None, exclude=None, clean=True, try: time_marker = get_time_marker(line) except ValueError as e: - msg = ( - 'At line %d in file %s: ' % - (i, self.filename()) + str(e) - ) + msg = "At line %d in file %s: " % ( + i, + self.filename(), + ) + str(e) raise ValueError(msg) output.append( - (tier_marker, - clean_utterance(line), - time_marker) + ( + tier_marker, + clean_utterance(line), + time_marker, + ) ) else: output.append((tier_marker, clean_utterance(line))) @@ -1276,28 +1449,33 @@ def _determine_participants(self, participant, exclude): if participant is None: include_participants = self.participant_codes() - elif isinstance(participant, (str, unicode_)): + elif isinstance(participant, str): include_participants = {participant} - elif hasattr(participant, '__iter__'): + elif hasattr(participant, "__iter__"): include_participants = set(participant) else: - raise TypeError('"participant" should be either str or ' - 'an iterable of str: {}' - .format(repr(participant))) + raise TypeError( + '"participant" should be either str or ' + "an iterable of str: {}".format(repr(participant)) + ) if exclude is None: exclude_participants = set() - elif isinstance(exclude, (str, unicode_)): + elif isinstance(exclude, str): exclude_participants = {exclude} - elif hasattr(exclude, '__iter__'): + elif hasattr(exclude, "__iter__"): exclude_participants = set(exclude) else: - raise TypeError('"exclude" should be either str or ' - 'an iterable of str: {}' - .format(repr(exclude))) + raise TypeError( + '"exclude" should be either str or ' + "an iterable of str: {}".format(repr(exclude)) + ) - return {p for p in self.participant_codes() - if p in include_participants and p not in exclude_participants} + return { + p + for p in self.participant_codes() + if p in include_participants and p not in exclude_participants + } def words(self, participant=None, exclude=None): """ @@ -1315,8 +1493,9 @@ def words(self, participant=None, exclude=None): For child-directed speech (i.e., targeting all participant except ``'CHI'``), use ``^(?!.*CHI).*$``. """ - return self._get_words(participant=participant, exclude=exclude, - tagged=False, sents=False) + return self._get_words( + participant=participant, exclude=exclude, tagged=False, sents=False + ) def tagged_words(self, participant=None, exclude=None): """ @@ -1334,8 +1513,9 @@ def tagged_words(self, participant=None, exclude=None): For child-directed speech (i.e., targeting all participant except ``'CHI'``), use ``^(?!.*CHI).*$``. 
""" - return self._get_words(participant=participant, exclude=exclude, - tagged=True, sents=False) + return self._get_words( + participant=participant, exclude=exclude, tagged=True, sents=False + ) def sents(self, participant=None, exclude=None): """ @@ -1355,8 +1535,9 @@ def sents(self, participant=None, exclude=None): For child-directed speech (i.e., targeting all participant except ``'CHI'``), use ``^(?!.*CHI).*$``. """ - return self._get_words(participant=participant, exclude=exclude, - tagged=False, sents=True) + return self._get_words( + participant=participant, exclude=exclude, tagged=False, sents=True + ) def tagged_sents(self, participant=None, exclude=None): """ @@ -1376,11 +1557,13 @@ def tagged_sents(self, participant=None, exclude=None): For child-directed speech (i.e., targeting all participant except ``'CHI'``), use ``^(?!.*CHI).*$``. """ - return self._get_words(participant=participant, exclude=exclude, - tagged=True, sents=True) + return self._get_words( + participant=participant, exclude=exclude, tagged=True, sents=True + ) - def _get_words(self, participant=None, exclude=None, tagged=True, - sents=True): + def _get_words( + self, participant=None, exclude=None, tagged=True, sents=True + ): """ Extract words for the specified participant(s). @@ -1463,8 +1646,7 @@ def _create_all_tagged_sents(self): participant_code = get_participant_code(tiermarker_to_line.keys()) # get the plain words from utterance tier - utterance = clean_utterance( - tiermarker_to_line[participant_code]) + utterance = clean_utterance(tiermarker_to_line[participant_code]) words = utterance.split() # %mor tier @@ -1472,14 +1654,14 @@ def _create_all_tagged_sents(self): clitic_count = 0 mor_items = [] - if '%mor' in tiermarker_to_line: - mor_split = tiermarker_to_line['%mor'].split() + if "%mor" in tiermarker_to_line: + mor_split = tiermarker_to_line["%mor"].split() for j, item in enumerate(mor_split): - tilde_count = item.count('~') + tilde_count = item.count("~") if tilde_count: - item_split = item.split('~') + item_split = item.split("~") for k in range(tilde_count): clitic_indices.append(clitic_count + j + k + 1) @@ -1492,21 +1674,25 @@ def _create_all_tagged_sents(self): mor_items.append(item) if mor_items and ((len(words) + clitic_count) != len(mor_items)): - message = 'cannot align the utterance and %mor tiers:\n' + \ - 'Filename: {}\nTiers --\n{}\n' + \ - 'Cleaned-up utterance --\n{}' - raise ValueError(message.format( - self.filename(), - pformat(tiermarker_to_line), utterance)) + message = ( + "cannot align the utterance and %mor tiers:\n" + + "Filename: {}\nTiers --\n{}\n" + + "Cleaned-up utterance --\n{}" + ) + raise ValueError( + message.format( + self.filename(), pformat(tiermarker_to_line), utterance + ) + ) # %gra tier gra_items = [] - if '%gra' in tiermarker_to_line: - for item in tiermarker_to_line['%gra'].split(): + if "%gra" in tiermarker_to_line: + for item in tiermarker_to_line["%gra"].split(): # an item is a string like '1|2|SUBJ' item_list = [] - for element in item.split('|'): + for element in item.split("|"): try: converted_element = int(element) except ValueError: @@ -1516,15 +1702,17 @@ def _create_all_tagged_sents(self): gra_items.append(tuple(item_list)) - if mor_items and gra_items and \ - (len(mor_items) != len(gra_items)): - raise ValueError('cannot align the %mor and %gra tiers:\n{}' - .format(pformat(tiermarker_to_line))) + if mor_items and gra_items and (len(mor_items) != len(gra_items)): + raise ValueError( + "cannot align the %mor and %gra tiers:\n{}".format( + 
pformat(tiermarker_to_line) + ) + ) # utterance tier if mor_items and clitic_count: word_iterator = iter(words) - utterance_items = [''] * len(mor_items) + utterance_items = [""] * len(mor_items) for j in range(len(mor_items)): if j in clitic_indices: @@ -1536,14 +1724,14 @@ def _create_all_tagged_sents(self): # determine what to yield (and how) to create the generator if not mor_items: - mor_items = [''] * len(utterance_items) + mor_items = [""] * len(utterance_items) if not gra_items: - gra_items = [''] * len(utterance_items) + gra_items = [""] * len(utterance_items) sent = [] for word, mor, gra in zip(utterance_items, mor_items, gra_items): - pos, _, mor = mor.partition('|') + pos, _, mor = mor.partition("|") output_word = (clean_word(word), pos.upper(), mor, gra) # pos in uppercase follows NLTK convention @@ -1603,8 +1791,9 @@ def part_of_speech_tags(self, participant=None, exclude=None): except ``'CHI'``), use ``^(?!.*CHI).*$``. """ output_set = set() - tagged_words = self.tagged_words(participant=participant, - exclude=exclude) + tagged_words = self.tagged_words( + participant=participant, exclude=exclude + ) for tagged_word in tagged_words: pos = tagged_word[1] @@ -1636,12 +1825,12 @@ def word_ngrams(self, n, participant=None, exclude=None, keep_case=True): lowercase. """ if (type(n) is not int) or (n < 1): - raise ValueError('n must be a positive integer: %r' % n) + raise ValueError("n must be a positive integer: %r" % n) if n == 1: - return self.word_frequency(participant=participant, - exclude=exclude, - keep_case=keep_case) + return self.word_frequency( + participant=participant, exclude=exclude, keep_case=keep_case + ) sents = self.sents(participant=participant, exclude=exclude) output_counter = Counter() @@ -1656,78 +1845,114 @@ def word_ngrams(self, n, participant=None, exclude=None, keep_case=True): return output_counter - def MLU(self, participant='CHI', exclude=None): + def MLU(self, participant="CHI", exclude=None): """ Return the MLU in morphemes for *participant* (default to ``'CHI'``); same as ``MLUm()``. :param participant: The participant specified, default to ``'CHI'`` """ - return get_MLUm(self.tagged_sents(participant=participant, - exclude=exclude), - pos_to_ignore=self.pos_to_ignore) + return get_MLUm( + self.tagged_sents(participant=participant, exclude=exclude), + pos_to_ignore=self.pos_to_ignore, + ) - def MLUm(self, participant='CHI', exclude=None): + def MLUm(self, participant="CHI", exclude=None): """ Return the MLU in morphemes for *participant* (default to ``'CHI'``); same as ``MLU()``. :param participant: The participant specified, default to ``'CHI'`` """ - return get_MLUm(self.tagged_sents(participant=participant, - exclude=exclude), - pos_to_ignore=self.pos_to_ignore) + return get_MLUm( + self.tagged_sents(participant=participant, exclude=exclude), + pos_to_ignore=self.pos_to_ignore, + ) - def MLUw(self, participant='CHI', exclude=None): + def MLUw(self, participant="CHI", exclude=None): """ Return the mean length of utterance (MLU) in words for *participant* (default to ``'CHI'``). 
:param participant: The participant specified, default to ``'CHI'`` """ - return get_MLUw(self.sents(participant=participant, exclude=exclude), - words_to_ignore=self.words_to_ignore) + return get_MLUw( + self.sents(participant=participant, exclude=exclude), + words_to_ignore=self.words_to_ignore, + ) - def TTR(self, participant='CHI', exclude=None): + def TTR(self, participant="CHI", exclude=None): """ Return the type-token ratio (TTR) for *participant* (default to ``'CHI'``). :param participant: The participant specified, default to ``'CHI'`` """ - return get_TTR(self.word_frequency(participant=participant, - exclude=exclude), - words_to_ignore=self.words_to_ignore) + return get_TTR( + self.word_frequency(participant=participant, exclude=exclude), + words_to_ignore=self.words_to_ignore, + ) - def IPSyn(self, participant='CHI', exclude=None): + def IPSyn(self, participant="CHI", exclude=None): """ Return the index of productive syntax (IPSyn) for *participant* (default to ``'CHI'``). :param participant: The participant specified, default to ``'CHI'`` """ - return get_IPSyn(self.tagged_sents(participant=participant, - exclude=exclude)) - - def search(self, search_item, participant=None, exclude=None, - match_entire_word=True, lemma=False, - output_tagged=True, output_sents=True): - return self._search(search_item, participant=participant, - exclude=exclude, - match_entire_word=match_entire_word, lemma=lemma, - concordance=False, output_tagged=output_tagged, - output_sents=output_sents) - - def concordance(self, search_item, participant=None, exclude=None, - match_entire_word=True, lemma=False): - return self._search(search_item, participant=participant, - exclude=exclude, - match_entire_word=match_entire_word, lemma=lemma, - concordance=True) - - def _search(self, search_item, participant=None, exclude=None, - match_entire_word=True, lemma=False, concordance=False, - output_tagged=True, output_sents=True): + return get_IPSyn( + self.tagged_sents(participant=participant, exclude=exclude) + ) + + def search( + self, + search_item, + participant=None, + exclude=None, + match_entire_word=True, + lemma=False, + output_tagged=True, + output_sents=True, + ): + return self._search( + search_item, + participant=participant, + exclude=exclude, + match_entire_word=match_entire_word, + lemma=lemma, + concordance=False, + output_tagged=output_tagged, + output_sents=output_sents, + ) + + def concordance( + self, + search_item, + participant=None, + exclude=None, + match_entire_word=True, + lemma=False, + ): + return self._search( + search_item, + participant=participant, + exclude=exclude, + match_entire_word=match_entire_word, + lemma=lemma, + concordance=True, + ) + + def _search( + self, + search_item, + participant=None, + exclude=None, + match_entire_word=True, + lemma=False, + concordance=False, + output_tagged=True, + output_sents=True, + ): taggedsent_charnumber_list = [] # = list of (tagged_sent, char_number) @@ -1737,8 +1962,9 @@ def _search(self, search_item, participant=None, exclude=None, else: match_function = lambda search_, test_: search_ in test_ - tagged_sents = self.tagged_sents(participant=participant, - exclude=exclude) + tagged_sents = self.tagged_sents( + participant=participant, exclude=exclude + ) for tagged_sent in tagged_sents: for i, tagged_word in enumerate(tagged_sent): @@ -1759,12 +1985,17 @@ def _search(self, search_item, participant=None, exclude=None, if match_function(search_item, test_item): preceding_words = [tagged_sent[k][0] for k in range(i)] - preceding_words = 
[w for w in preceding_words
-                                       if w != CLITIC]  # remove CLITIC
-                    char_number = (sum(len(w) for w in preceding_words)
-                                   + len(preceding_words) - 1)  # plus spaces
-                    taggedsent_charnumber_list.append((tagged_sent,
-                                                       char_number))
+                    preceding_words = [
+                        w for w in preceding_words if w != CLITIC
+                    ]  # remove CLITIC
+                    char_number = (
+                        sum(len(w) for w in preceding_words)
+                        + len(preceding_words)
+                        - 1
+                    )  # plus spaces
+                    taggedsent_charnumber_list.append(
+                        (tagged_sent, char_number)
+                    )
 
         if not taggedsent_charnumber_list:  # if empty
             return taggedsent_charnumber_list
@@ -1779,8 +2010,9 @@ def _search(self, search_item, participant=None, exclude=None,
         if output_tagged:
             sent_to_add = lambda sent_: sent_
         else:
-            sent_to_add = lambda sent_: [x[0]
-                                         for x in sent_ if x[0] != CLITIC]
+            sent_to_add = lambda sent_: [
+                x[0] for x in sent_ if x[0] != CLITIC
+            ]
 
         result_list = []
 
@@ -1793,10 +2025,12 @@ def _search(self, search_item, participant=None, exclude=None,
         result_list = []
 
         for tagged_sent, char_number in taggedsent_charnumber_list:
-            sent = [word_ for word_, _, _, _ in tagged_sent
-                    if word_ != CLITIC]
-            sent_str = (' ' * (max_char_number - char_number)
-                        + ' '.join(sent))
+            sent = [
+                word_ for word_, _, _, _ in tagged_sent if word_ != CLITIC
+            ]
+            sent_str = " " * (max_char_number - char_number) + " ".join(
+                sent
+            )
             result_list.append(sent_str)
 
         return result_list
diff --git a/pylangacq/compat.py b/pylangacq/compat.py
deleted file mode 100644
index cf529ca..0000000
--- a/pylangacq/compat.py
+++ /dev/null
@@ -1,22 +0,0 @@
-"""Compatibility between Python 2 and 3."""
-
-import sys
-import io
-
-
-if sys.version_info[0] == 2:  # pragma: no coverage
-    open = io.open
-    unicode_ = unicode  # noqa F821 (undefined name 'unicode' in python >= 3)
-    OPEN_MODE = 'rU'
-else:  # pragma: no coverage
-    open = open
-    unicode_ = str
-    # 'U' deprecated since python 3.4, to removed in python 4.0
-    # https://docs.python.org/3/library/functions.html#open
-    OPEN_MODE = 'r'
-
-
-try:
-    FileNotFoundError = FileNotFoundError
-except NameError:
-    FileNotFoundError = IOError
diff --git a/pylangacq/dependency.py b/pylangacq/dependency.py
index b10dccf..1d505f4 100644
--- a/pylangacq/dependency.py
+++ b/pylangacq/dependency.py
@@ -9,6 +9,7 @@ class DependencyGraph(object):
     tagged_sent : list of tuple(str, str, str, str)
         A tagged sentence as a list of (word, pos, mor, rel).
     """
+
     def __init__(self, tagged_sent):
         self.node = {}  # from node to dict (node's properties)
         self.edge = {}  # from node to node to dict (edge's properties)
@@ -69,19 +70,13 @@ def _create_graph_from_chat(self):
             except ValueError:
                 node1 = -1
                 node2 = -1
-                relation = '**ERROR**'
+                relation = "**ERROR**"
                 self._faulty = True
 
             self.add_edge(node1, node2, rel=relation)
-            self.node[node1] = {'word': word,
-                                'pos': pos,
-                                'mor': mor,
-                                }
+            self.node[node1] = {"word": word, "pos": pos, "mor": mor}
 
-        self.node[0] = {'word': 'ROOT',
-                        'pos': 'ROOT',
-                        'mor': 'ROOT',
-                        }
+        self.node[0] = {"word": "ROOT", "pos": "ROOT", "mor": "ROOT"}
 
     def faulty(self):
         """Determine whether the graph is faulty for dependency information.
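For orientation, here is how the `DependencyGraph` API touched by the hunks above and below is used. This is a minimal sketch: the constructor and `to_conll()` calls mirror `pylangacq/tests/test_dependency.py` at the end of this patch, while the four-token sample sentence itself is invented for illustration.

```python
from pylangacq.dependency import DependencyGraph

# Each token is (word, pos, mor, rel), where rel is the %gra triple
# (dependent_index, head_index, relation) and index 0 is the ROOT node.
tagged_sent = [
    ("I", "PRO:SUB", "I", (1, 2, "SUBJ")),
    ("see", "V", "see", (2, 0, "ROOT")),
    ("it", "PRO", "it", (3, 2, "OBJ")),
    (".", ".", "", (4, 2, "PUNCT")),
]

graph = DependencyGraph(tagged_sent)
assert not graph.faulty()  # all %gra indices parsed as integers
print(graph.to_conll())
# I PRO:SUB 2 SUBJ
# see V 0 ROOT
# it PRO 2 OBJ
# . . 2 PUNCT
```

`to_tikz()` renders the same graph as LaTeX tikz-dependency code instead of CoNLL-style rows.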
@@ -100,43 +95,47 @@ def to_tikz(self): str The LaTeX tikz-dependency code for drawing the graph """ - tikz_dep_code = '' + tikz_dep_code = "" # get graph info dep_to_head = dict(self.edges()) number_of_nodes = self.number_of_nodes() # add \begin{deptext}...\end{deptext} - words = [self.node[n]['word'] - for n in range(1, number_of_nodes)] - deptext_template = (u' \\begin{{deptext}}[column sep=1em]\n' - ' {} \\\\ \n' - ' \\end{{deptext}}\n') - tikz_dep_code += deptext_template.format(' \\& '.join(words)) + words = [self.node[n]["word"] for n in range(1, number_of_nodes)] + deptext_template = ( + u" \\begin{{deptext}}[column sep=1em]\n" + " {} \\\\ \n" + " \\end{{deptext}}\n" + ) + tikz_dep_code += deptext_template.format(" \\& ".join(words)) # add the \deproot line dep_shooting_to_root = 0 - root_rel = '' + root_rel = "" for dep in range(1, number_of_nodes): head = dep_to_head[dep] if head == 0: dep_shooting_to_root = dep - root_rel = self.edge[dep_shooting_to_root][0]['rel'] + root_rel = self.edge[dep_shooting_to_root][0]["rel"] break - tikz_dep_code += u' \\deproot{{{}}}{{{}}}\n'.format( - dep_shooting_to_root, root_rel) + tikz_dep_code += u" \\deproot{{{}}}{{{}}}\n".format( + dep_shooting_to_root, root_rel + ) # add the \depedge lines for dep in range(1, number_of_nodes): head = dep_to_head[dep] - rel = self.edge[dep][head]['rel'] - tikz_dep_code += u' \\depedge{{{}}}{{{}}}{{{}}}\n'.format( - dep, head, rel) + rel = self.edge[dep][head]["rel"] + tikz_dep_code += u" \\depedge{{{}}}{{{}}}{{{}}}\n".format( + dep, head, rel + ) # return tikz_dep_code # wrapped inside \begin{dependency}...\end{dependency} - dependency_template = (u'\\begin{{dependency}}[theme = simple]\n' - '{}\\end{{dependency}}') + dependency_template = ( + u"\\begin{{dependency}}[theme = simple]\n" "{}\\end{{dependency}}" + ) return dependency_template.format(tikz_dep_code) def to_conll(self): @@ -152,9 +151,9 @@ def to_conll(self): for dep in range(1, self.number_of_nodes()): head = dep_to_head[dep] - word = self.node[dep]['word'] - pos = self.node[dep]['pos'] - rel = self.edge[dep][head]['rel'] - collector.append(u'{} {} {} {}'.format(word, pos, head, rel)) + word = self.node[dep]["word"] + pos = self.node[dep]["pos"] + rel = self.edge[dep][head]["rel"] + collector.append(u"{} {} {} {}".format(word, pos, head, rel)) - return '\n'.join(collector) + return "\n".join(collector) diff --git a/pylangacq/measures.py b/pylangacq/measures.py index f57f6df..c8160b9 100644 --- a/pylangacq/measures.py +++ b/pylangacq/measures.py @@ -1,7 +1,3 @@ -# -*- coding: utf-8 -*- - -from __future__ import division - from pylangacq.util import CLITIC, get_lemma_from_mor from pylangacq.dependency import DependencyGraph @@ -24,8 +20,8 @@ def get_MLUm(tagged_sents, pos_to_ignore=None): continue total_morpheme_count += 1 - total_morpheme_count += morph.count('-') - total_morpheme_count += morph.count('~') + total_morpheme_count += morph.count("-") + total_morpheme_count += morph.count("~") if total_utterance_count: return total_morpheme_count / total_utterance_count @@ -59,8 +55,11 @@ def get_TTR(word_freq_dict, words_to_ignore=None): """Type-token ratio (TTR)""" # *word_freq_dict* already filtered for the desired participant like 'CHI' if words_to_ignore: - word_freq_dict = {word: freq for word, freq in word_freq_dict.items() - if word not in words_to_ignore} + word_freq_dict = { + word: freq + for word, freq in word_freq_dict.items() + if word not in words_to_ignore + } return len(word_freq_dict) / sum(word_freq_dict.values()) @@ -68,46 +67,124 
@@ def get_TTR(word_freq_dict, words_to_ignore=None): def get_IPSyn(tagged_sents): """Index of Productive Syntax (IPSyn)""" if len(tagged_sents) > 100: - tagged_sents = tagged_sents[: 100] + tagged_sents = tagged_sents[:100] scoring_board = { - 'N1': 0, 'N2': 0, 'N3': 0, 'N4': 0, 'N5': 0, 'N6': 0, - 'N7': 0, 'N8': 0, 'N9': 0, 'N10': 0, 'N11': 0, - - 'V1': 0, 'V2': 0, 'V3': 0, 'V4': 0, 'V5': 0, 'V6': 0, - 'V7': 0, 'V8': 0, 'V9': 0, 'V10': 0, 'V11': 0, 'V12': 0, - 'V13': 0, 'V14': 0, 'V15': 0, 'V16': 0, - - 'Q1': 0, 'Q2': 0, 'Q3': 0, 'Q4': 0, 'Q5': 0, 'Q6': 0, - 'Q7': 0, 'Q8': 0, 'Q9': 0, 'Q10': 0, - - 'S1': 0, 'S2': 0, 'S3': 0, 'S4': 0, 'S5': 0, 'S6': 0, - 'S7': 0, 'S8': 0, 'S9': 0, 'S10': 0, 'S11': 0, 'S12': 0, - 'S13': 0, 'S14': 0, 'S15': 0, 'S16': 0, 'S17': 0, 'S18': 0, - 'S19': 0, + "N1": 0, + "N2": 0, + "N3": 0, + "N4": 0, + "N5": 0, + "N6": 0, + "N7": 0, + "N8": 0, + "N9": 0, + "N10": 0, + "N11": 0, + "V1": 0, + "V2": 0, + "V3": 0, + "V4": 0, + "V5": 0, + "V6": 0, + "V7": 0, + "V8": 0, + "V9": 0, + "V10": 0, + "V11": 0, + "V12": 0, + "V13": 0, + "V14": 0, + "V15": 0, + "V16": 0, + "Q1": 0, + "Q2": 0, + "Q3": 0, + "Q4": 0, + "Q5": 0, + "Q6": 0, + "Q7": 0, + "Q8": 0, + "Q9": 0, + "Q10": 0, + "S1": 0, + "S2": 0, + "S3": 0, + "S4": 0, + "S5": 0, + "S6": 0, + "S7": 0, + "S8": 0, + "S9": 0, + "S10": 0, + "S11": 0, + "S12": 0, + "S13": 0, + "S14": 0, + "S15": 0, + "S16": 0, + "S17": 0, + "S18": 0, + "S19": 0, } scoring_board_stop = { - 'N1': False, 'N2': False, 'N3': False, 'N4': False, - 'N5': False, 'N6': False, 'N7': False, 'N8': False, - 'N9': False, 'N10': False, 'N11': False, - - 'V1': False, 'V2': False, 'V3': False, 'V4': False, - 'V5': False, 'V6': False, 'V7': False, 'V8': False, - 'V9': False, 'V10': False, 'V11': False, 'V12': False, - 'V13': False, 'V14': False, 'V15': False, - 'V16': False, - - 'Q1': False, 'Q2': False, 'Q3': False, 'Q4': False, - 'Q5': False, 'Q6': False, 'Q7': False, 'Q8': False, - 'Q9': False, 'Q10': False, - - 'S1': False, 'S2': False, 'S3': False, 'S4': False, - 'S5': False, 'S6': False, 'S7': False, 'S8': False, - 'S9': False, 'S10': False, 'S11': False, 'S12': False, - 'S13': False, 'S14': False, 'S15': False, - 'S16': False, 'S17': False, 'S18': False, - 'S19': False, + "N1": False, + "N2": False, + "N3": False, + "N4": False, + "N5": False, + "N6": False, + "N7": False, + "N8": False, + "N9": False, + "N10": False, + "N11": False, + "V1": False, + "V2": False, + "V3": False, + "V4": False, + "V5": False, + "V6": False, + "V7": False, + "V8": False, + "V9": False, + "V10": False, + "V11": False, + "V12": False, + "V13": False, + "V14": False, + "V15": False, + "V16": False, + "Q1": False, + "Q2": False, + "Q3": False, + "Q4": False, + "Q5": False, + "Q6": False, + "Q7": False, + "Q8": False, + "Q9": False, + "Q10": False, + "S1": False, + "S2": False, + "S3": False, + "S4": False, + "S5": False, + "S6": False, + "S7": False, + "S8": False, + "S9": False, + "S10": False, + "S11": False, + "S12": False, + "S13": False, + "S14": False, + "S15": False, + "S16": False, + "S17": False, + "S18": False, + "S19": False, } def add_one_point_if_needed(item): @@ -162,12 +239,12 @@ def N1(graph): N1: Proper, mass, or count noun """ for i in range(1, graph.number_of_nodes()): - pos = graph.node[i]['pos'] + pos = graph.node[i]["pos"] - if pos.startswith('N:') or pos == 'N': - scoring_board['N1'] += 1 + if pos.startswith("N:") or pos == "N": + scoring_board["N1"] += 1 - if turn_off_scoring_board('N1'): + if turn_off_scoring_board("N1"): break # noinspection 
PyPep8Naming @@ -177,12 +254,12 @@ def N2(graph): N2: Pronoun or prolocative, excluding modifiers """ for i in range(1, graph.number_of_nodes()): - pos = graph.node[i]['pos'] + pos = graph.node[i]["pos"] - if pos.startswith('PRO') and pos != 'PRO:POSS:DET': - scoring_board['N2'] += 1 + if pos.startswith("PRO") and pos != "PRO:POSS:DET": + scoring_board["N2"] += 1 - if turn_off_scoring_board('N2'): + if turn_off_scoring_board("N2"): break # noinspection PyPep8Naming @@ -192,12 +269,12 @@ def N3(graph): N3: Modifier, including adjectives, possessives, and quantifiers """ for i in range(1, graph.number_of_nodes()): - pos = graph.node[i]['pos'] + pos = graph.node[i]["pos"] - if pos in {'PRO:POSS:DET', 'ADJ', 'QN'}: - scoring_board['N3'] += 1 + if pos in {"PRO:POSS:DET", "ADJ", "QN"}: + scoring_board["N3"] += 1 - if turn_off_scoring_board('N3'): + if turn_off_scoring_board("N3"): break # noinspection PyPep8Naming @@ -210,14 +287,15 @@ def N4(graph): return for i in range(1, graph.number_of_nodes() - 1): - pos1 = graph.node[i]['pos'] - pos2 = graph.node[i + 1]['pos'] + pos1 = graph.node[i]["pos"] + pos2 = graph.node[i + 1]["pos"] - if pos1 in {'PRO:POSS:DET', 'ADJ', 'QN'} and \ - (pos2.startswith('N:') or pos2 == 'N'): - scoring_board['N4'] += 1 + if pos1 in {"PRO:POSS:DET", "ADJ", "QN"} and ( + pos2.startswith("N:") or pos2 == "N" + ): + scoring_board["N4"] += 1 - if turn_off_scoring_board('N4'): + if turn_off_scoring_board("N4"): break # noinspection PyPep8Naming @@ -230,14 +308,14 @@ def N5(graph): return for i in range(1, graph.number_of_nodes() - 1): - pos1 = graph.node[i]['pos'] - pos2 = graph.node[i + 1]['pos'] + pos1 = graph.node[i]["pos"] + pos2 = graph.node[i + 1]["pos"] - if pos1 == 'DET' and (pos2.startswith('N:') or pos2 == 'N'): - scoring_board['N5'] += 1 - add_one_point_if_needed('N4') + if pos1 == "DET" and (pos2.startswith("N:") or pos2 == "N"): + scoring_board["N5"] += 1 + add_one_point_if_needed("N4") - if turn_off_scoring_board('N5'): + if turn_off_scoring_board("N5"): break # noinspection PyPep8Naming @@ -250,17 +328,19 @@ def N6(graph): return for i in range(1, graph.number_of_nodes() - 2): - pos1 = graph.node[i]['pos'] - pos2 = graph.node[i + 1]['pos'] - pos3 = graph.node[i + 2]['pos'] - - if pos2 in {'PRO:POSS:DET', 'ADJ', 'QN'} and \ - (pos3.startswith('N:') or pos3 == 'N') and \ - (pos1 in {'V', 'PREP'}): - scoring_board['N6'] += 1 - add_one_point_if_needed('N4') - - if turn_off_scoring_board('N6'): + pos1 = graph.node[i]["pos"] + pos2 = graph.node[i + 1]["pos"] + pos3 = graph.node[i + 2]["pos"] + + if ( + pos2 in {"PRO:POSS:DET", "ADJ", "QN"} + and (pos3.startswith("N:") or pos3 == "N") + and (pos1 in {"V", "PREP"}) + ): + scoring_board["N6"] += 1 + add_one_point_if_needed("N4") + + if turn_off_scoring_board("N6"): break # noinspection PyPep8Naming @@ -270,12 +350,12 @@ def N7(graph): N7: Plural suffix """ for i in range(1, graph.number_of_nodes()): - mor = graph.node[i]['mor'] + mor = graph.node[i]["mor"] - if '-PL' in mor: - scoring_board['N7'] += 1 + if "-PL" in mor: + scoring_board["N7"] += 1 - if turn_off_scoring_board('N7'): + if turn_off_scoring_board("N7"): break # noinspection PyPep8Naming @@ -288,17 +368,19 @@ def N8(graph): return for i in range(1, graph.number_of_nodes() - 2): - pos1 = graph.node[i]['pos'] - pos2 = graph.node[i + 1]['pos'] - pos3 = graph.node[i + 2]['pos'] - - if pos1 in {'PRO:POSS:DET', 'ADJ', 'QN'} and \ - (pos2.startswith('N:') or pos2 == 'N') and \ - (pos3 == 'V'): - scoring_board['N8'] += 1 - add_one_point_if_needed('N4') - - if 
turn_off_scoring_board('N8'): + pos1 = graph.node[i]["pos"] + pos2 = graph.node[i + 1]["pos"] + pos3 = graph.node[i + 2]["pos"] + + if ( + pos1 in {"PRO:POSS:DET", "ADJ", "QN"} + and (pos2.startswith("N:") or pos2 == "N") + and (pos3 == "V") + ): + scoring_board["N8"] += 1 + add_one_point_if_needed("N4") + + if turn_off_scoring_board("N8"): break # noinspection PyPep8Naming @@ -311,17 +393,19 @@ def N9(graph): return for i in range(1, graph.number_of_nodes() - 2): - pos1 = graph.node[i]['pos'] - pos2 = graph.node[i + 1]['pos'] - pos3 = graph.node[i + 2]['pos'] - - if (pos1 in {'PRO:POSS:DET', 'ADJ', 'QN'}) and \ - (pos2 in {'ADJ', 'QN'}) and \ - (pos3.startswith('N:') or pos3 == 'N'): - scoring_board['N9'] += 1 - add_one_point_if_needed('N4') - - if turn_off_scoring_board('N9'): + pos1 = graph.node[i]["pos"] + pos2 = graph.node[i + 1]["pos"] + pos3 = graph.node[i + 2]["pos"] + + if ( + (pos1 in {"PRO:POSS:DET", "ADJ", "QN"}) + and (pos2 in {"ADJ", "QN"}) + and (pos3.startswith("N:") or pos3 == "N") + ): + scoring_board["N9"] += 1 + add_one_point_if_needed("N4") + + if turn_off_scoring_board("N9"): break # noinspection PyPep8Naming @@ -334,18 +418,18 @@ def N10(graph): return for i in range(1, graph.number_of_nodes()): - pos = graph.node[i]['pos'] + pos = graph.node[i]["pos"] - if pos == 'ADV': + if pos == "ADV": for j in graph.edge[i].keys(): - pos_of_head = graph.node[j]['pos'] + pos_of_head = graph.node[j]["pos"] - if pos_of_head in {'ADJ', 'N'}: - scoring_board['N10'] += 1 - add_one_point_if_needed('V8') + if pos_of_head in {"ADJ", "N"}: + scoring_board["N10"] += 1 + add_one_point_if_needed("V8") break - if turn_off_scoring_board('N10'): + if turn_off_scoring_board("N10"): break # noinspection PyPep8Naming @@ -355,16 +439,16 @@ def N11(graph): N11: Any other bound morpheme on N or adjective """ for i in range(1, graph.number_of_nodes()): - pos = graph.node[i]['pos'] + pos = graph.node[i]["pos"] - if pos in {'N', 'ADJ'} or pos.startswith('N:'): - mor = graph.node[i]['mor'] - mor = mor.replace('-PL', '') + if pos in {"N", "ADJ"} or pos.startswith("N:"): + mor = graph.node[i]["mor"] + mor = mor.replace("-PL", "") - if '-' in mor: - scoring_board['N11'] += 1 + if "-" in mor: + scoring_board["N11"] += 1 - if turn_off_scoring_board('N11'): + if turn_off_scoring_board("N11"): break # noinspection PyPep8Naming @@ -374,12 +458,12 @@ def V1(graph): V1: Verb """ for i in range(1, graph.number_of_nodes()): - pos = graph.node[i]['pos'] + pos = graph.node[i]["pos"] - if pos == 'V': - scoring_board['V1'] += 1 + if pos == "V": + scoring_board["V1"] += 1 - if turn_off_scoring_board('V1'): + if turn_off_scoring_board("V1"): break # noinspection PyPep8Naming @@ -389,12 +473,12 @@ def V2(graph): V2: Particle or Preposition """ for i in range(1, graph.number_of_nodes()): - pos = graph.node[i]['pos'] + pos = graph.node[i]["pos"] - if pos == 'PREP': - scoring_board['V2'] += 1 + if pos == "PREP": + scoring_board["V2"] += 1 - if turn_off_scoring_board('V2'): + if turn_off_scoring_board("V2"): break # noinspection PyPep8Naming @@ -406,11 +490,11 @@ def V3(graph): for i in range(1, graph.number_of_nodes()): for j in graph.edge[i].keys(): - if graph.edge[i][j]['rel'] == 'POBJ': - scoring_board['V3'] += 1 - add_one_point_if_needed('V2') + if graph.edge[i][j]["rel"] == "POBJ": + scoring_board["V3"] += 1 + add_one_point_if_needed("V2") - if turn_off_scoring_board('V3'): + if turn_off_scoring_board("V3"): break # noinspection PyPep8Naming @@ -423,8 +507,8 @@ def V4(graph): return for i in range(1, 
graph.number_of_nodes()): - pos = graph.node[i]['pos'] - if pos != 'COP': + pos = graph.node[i]["pos"] + if pos != "COP": continue subject = False @@ -434,17 +518,18 @@ def V4(graph): if head != i: continue - if graph.edge[dep][head]['rel'] == 'SUBJ' and \ - not graph.node[dep]['pos'].endswith('WH'): + if graph.edge[dep][head]["rel"] == "SUBJ" and not graph.node[ + dep + ]["pos"].endswith("WH"): subject = True - elif graph.edge[dep][head]['rel'] == 'PRED': + elif graph.edge[dep][head]["rel"] == "PRED": predicate = True if subject and predicate: - scoring_board['V4'] += 1 - add_one_point_if_needed('V1') + scoring_board["V4"] += 1 + add_one_point_if_needed("V1") - if turn_off_scoring_board('V4'): + if turn_off_scoring_board("V4"): break # noinspection PyPep8Naming @@ -456,24 +541,34 @@ def V5(graph): if not graph.number_of_nodes() > 2: return - pseudo_aux = {'hafta', 'haf(ta)', - 's\'pose(da)', 's\'poseda', - 'gonna', 'gon(na)', - 'wanna', 'wanta', 'wan(t)(a)', 'want(a)', 'wan(na)', - 'gotta', 'got(ta)', - 'better'} + pseudo_aux = { + "hafta", + "haf(ta)", + "s'pose(da)", + "s'poseda", + "gonna", + "gon(na)", + "wanna", + "wanta", + "wan(t)(a)", + "want(a)", + "wan(na)", + "gotta", + "got(ta)", + "better", + } for i in range(1, graph.number_of_nodes() - 1): - pos2 = graph.node[i + 1]['pos'] - if pos2 != 'V': + pos2 = graph.node[i + 1]["pos"] + if pos2 != "V": continue - word1 = graph.node[i]['word'] + word1 = graph.node[i]["word"] if word1 in pseudo_aux: - scoring_board['V5'] += 1 + scoring_board["V5"] += 1 - if turn_off_scoring_board('V5'): + if turn_off_scoring_board("V5"): break # noinspection PyPep8Naming @@ -483,16 +578,17 @@ def V6(graph): V6: Auxiliary be, do, have in VP (Also credit: V5) """ for i in range(1, graph.number_of_nodes()): - pos = graph.node[i]['pos'] - mor = graph.node[i]['mor'] + pos = graph.node[i]["pos"] + mor = graph.node[i]["mor"] lemma = get_lemma_from_mor(mor) - if (pos == 'AUX' and not mor.startswith('wi')) or \ - (lemma == 'do' and pos == 'MOD'): - scoring_board['V6'] += 1 - add_one_point_if_needed('V5') + if (pos == "AUX" and not mor.startswith("wi")) or ( + lemma == "do" and pos == "MOD" + ): + scoring_board["V6"] += 1 + add_one_point_if_needed("V5") - if turn_off_scoring_board('V6'): + if turn_off_scoring_board("V6"): break # noinspection PyPep8Naming @@ -502,12 +598,12 @@ def V7(graph): V7: Progressive suffix """ for i in range(1, graph.number_of_nodes()): - mor = graph.node[i]['mor'] + mor = graph.node[i]["mor"] - if mor.endswith('PRESP'): - scoring_board['V7'] += 1 + if mor.endswith("PRESP"): + scoring_board["V7"] += 1 - if turn_off_scoring_board('V7'): + if turn_off_scoring_board("V7"): break # noinspection PyPep8Naming @@ -517,12 +613,12 @@ def V8(graph): V8: Adverbs """ for i in range(1, graph.number_of_nodes()): - pos = graph.node[i]['pos'] + pos = graph.node[i]["pos"] - if pos == 'ADV': - scoring_board['V8'] += 1 + if pos == "ADV": + scoring_board["V8"] += 1 - if turn_off_scoring_board('V8'): + if turn_off_scoring_board("V8"): break # noinspection PyPep8Naming @@ -535,15 +631,15 @@ def V9(graph): return for i in range(1, graph.number_of_nodes() - 1): - pos = graph.node[i]['pos'] - word = graph.node[i]['word'] - pos2 = graph.node[i + 1]['pos'] + pos = graph.node[i]["pos"] + word = graph.node[i]["word"] + pos2 = graph.node[i + 1]["pos"] - if pos.startswith('MOD') and pos2 == 'V' and word != CLITIC: - scoring_board['V9'] += 1 - add_one_point_if_needed('V5') + if pos.startswith("MOD") and pos2 == "V" and word != CLITIC: + scoring_board["V9"] += 1 + 
add_one_point_if_needed("V5") - if turn_off_scoring_board('V9'): + if turn_off_scoring_board("V9"): break # noinspection PyPep8Naming @@ -553,12 +649,12 @@ def V10(graph): V10: Third person singular present tense suffix """ for i in range(1, graph.number_of_nodes()): - mor = graph.node[i]['mor'] + mor = graph.node[i]["mor"] - if '-3S' in mor: - scoring_board['V10'] += 1 + if "-3S" in mor: + scoring_board["V10"] += 1 - if turn_off_scoring_board('V10'): + if turn_off_scoring_board("V10"): break # noinspection PyPep8Naming @@ -567,19 +663,19 @@ def V11(graph): """ V11: Past tense modal (Also credit V9) """ - past_tense_modals = {'could', 'did', 'might', 'would', 'woudn\'t'} + past_tense_modals = {"could", "did", "might", "would", "woudn't"} for i in range(1, graph.number_of_nodes()): - pos = graph.node[i]['pos'] + pos = graph.node[i]["pos"] - if pos != 'MOD': + if pos != "MOD": continue - if graph.node[i]['word'] in past_tense_modals: - scoring_board['V11'] += 1 - add_one_point_if_needed('V9') + if graph.node[i]["word"] in past_tense_modals: + scoring_board["V11"] += 1 + add_one_point_if_needed("V9") - if turn_off_scoring_board('V11'): + if turn_off_scoring_board("V11"): break # noinspection PyPep8Naming @@ -589,12 +685,12 @@ def V12(graph): V12: Regular past tense suffix """ for i in range(1, graph.number_of_nodes()): - mor = graph.node[i]['mor'] + mor = graph.node[i]["mor"] - if '-PAST' in mor and '-PASTP' not in mor: - scoring_board['V12'] += 1 + if "-PAST" in mor and "-PASTP" not in mor: + scoring_board["V12"] += 1 - if turn_off_scoring_board('V12'): + if turn_off_scoring_board("V12"): break # noinspection PyPep8Naming @@ -603,17 +699,17 @@ def V13(graph): """ V13: Past tense auxiliary (Also credit V6) """ - aux_pos = {'AUX', 'MOD'} + aux_pos = {"AUX", "MOD"} for i in range(1, graph.number_of_nodes()): - mor = graph.node[i]['mor'] - pos = graph.node[i]['pos'] + mor = graph.node[i]["mor"] + pos = graph.node[i]["pos"] - if '&PAST' in mor and pos in aux_pos: - scoring_board['V13'] += 1 - add_one_point_if_needed('V6') + if "&PAST" in mor and pos in aux_pos: + scoring_board["V13"] += 1 + add_one_point_if_needed("V6") - if turn_off_scoring_board('V13'): + if turn_off_scoring_board("V13"): break # noinspection PyPep8Naming @@ -625,12 +721,12 @@ def V14(graph): for i in range(2, graph.number_of_nodes() - 1): # note the possible values of i for "medial" (not 1st or last word) - pos = graph.node[i]['pos'] - if pos == 'ADV': - scoring_board['V14'] += 1 - add_one_point_if_needed('V8') + pos = graph.node[i]["pos"] + if pos == "ADV": + scoring_board["V14"] += 1 + add_one_point_if_needed("V8") - if turn_off_scoring_board('V14'): + if turn_off_scoring_board("V14"): break # noinspection PyPep8Naming @@ -644,23 +740,23 @@ def V15(graph): return for i in range(1, graph.number_of_nodes() - 1): - pos1 = graph.node[i]['pos'] + pos1 = graph.node[i]["pos"] - if pos1 not in {'COP', 'AUX', 'MOD'}: + if pos1 not in {"COP", "AUX", "MOD"}: continue - mor2 = graph.node[i + 1]['mor'] + mor2 = graph.node[i + 1]["mor"] - if mor2 in {'', 'beg', 'end'}: # if mor2 is a punctuation - scoring_board['V15'] += 1 - add_one_point_if_needed('V4') - add_one_point_if_needed('V6') - add_one_point_if_needed('V9') - add_one_point_if_needed('V11') - add_one_point_if_needed('V13') - add_one_point_if_needed('V16') + if mor2 in {"", "beg", "end"}: # if mor2 is a punctuation + scoring_board["V15"] += 1 + add_one_point_if_needed("V4") + add_one_point_if_needed("V6") + add_one_point_if_needed("V9") + add_one_point_if_needed("V11") + 
add_one_point_if_needed("V13") + add_one_point_if_needed("V16") - if turn_off_scoring_board('V15'): + if turn_off_scoring_board("V15"): break # noinspection PyPep8Naming @@ -670,14 +766,14 @@ def V16(graph): V16: Past tense copula (Also credit V4) """ for i in range(1, graph.number_of_nodes()): - pos = graph.node[i]['pos'] - mor = graph.node[i]['mor'] + pos = graph.node[i]["pos"] + mor = graph.node[i]["mor"] - if pos.startswith('COP') and 'PAST' in mor: - scoring_board['V16'] += 1 - add_one_point_if_needed('V4') + if pos.startswith("COP") and "PAST" in mor: + scoring_board["V16"] += 1 + add_one_point_if_needed("V4") - if turn_off_scoring_board('V16'): + if turn_off_scoring_board("V16"): break # noinspection PyPep8Naming @@ -687,17 +783,17 @@ def Q1(graph): Q1: Intonationally marked question Automatically score 2 points if child earns 2 points on Q4 and/or Q8 """ - final_word = graph.node[graph.number_of_nodes() - 1]['word'] - if final_word != '?': + final_word = graph.node[graph.number_of_nodes() - 1]["word"] + if final_word != "?": return - first_word = graph.node[1]['word'] - if first_word in {'what', 'why', 'how', 'which', 'where', 'when'}: + first_word = graph.node[1]["word"] + if first_word in {"what", "why", "how", "which", "where", "when"}: return - scoring_board['Q1'] += 1 + scoring_board["Q1"] += 1 - if turn_off_scoring_board('Q1'): + if turn_off_scoring_board("Q1"): pass # noinspection PyPep8Naming @@ -709,18 +805,18 @@ def Q2(graph): """ # needs work here # currently only testing for wh-pronoun alone - final_word = graph.node[graph.number_of_nodes() - 1]['word'] - if final_word != '?': + final_word = graph.node[graph.number_of_nodes() - 1]["word"] + if final_word != "?": return - first_word = graph.node[1]['word'] - if first_word not in {'what', 'why', 'how', 'which', 'where', 'when'}: + first_word = graph.node[1]["word"] + if first_word not in {"what", "why", "how", "which", "where", "when"}: return if graph.number_of_nodes() > 2: - scoring_board['Q2'] += 1 + scoring_board["Q2"] += 1 - if turn_off_scoring_board('Q2'): + if turn_off_scoring_board("Q2"): pass # noinspection PyPep8Naming @@ -735,14 +831,17 @@ def Q3(graph): return for i in range(1, graph.number_of_nodes() - 1): - word1 = graph.node[i]['word'] - mor2 = graph.node[i + 1]['mor'] + word1 = graph.node[i]["word"] + mor2 = graph.node[i + 1]["mor"] - if (word1 in {'no', 'not', 'can\'t', 'don\'t'} - and mor2 not in {'', 'beg', 'end'}): - scoring_board['Q3'] += 1 + if word1 in {"no", "not", "can't", "don't"} and mor2 not in { + "", + "beg", + "end", + }: + scoring_board["Q3"] += 1 - if turn_off_scoring_board('Q3'): + if turn_off_scoring_board("Q3"): break # noinspection PyPep8Naming @@ -754,24 +853,24 @@ def Q4(graph): """ if not graph.number_of_nodes() > 2: return - final_word = graph.node[graph.number_of_nodes() - 1]['word'] - if final_word != '?': + final_word = graph.node[graph.number_of_nodes() - 1]["word"] + if final_word != "?": return - first_word = graph.node[1]['word'] - if first_word not in {'what', 'why', 'how', 'which', 'where', 'when'}: + first_word = graph.node[1]["word"] + if first_word not in {"what", "why", "how", "which", "where", "when"}: return root = graph.edges()[1] - if graph.node[root]['pos'] == 'V': - scoring_board['Q4'] += 1 + if graph.node[root]["pos"] == "V": + scoring_board["Q4"] += 1 - if turn_off_scoring_board('Q4'): - scoring_board['Q1'] = 2 - scoring_board['Q2'] = 2 - scoring_board_stop['Q1'] = True - scoring_board_stop['Q2'] = True + if turn_off_scoring_board("Q4"): + scoring_board["Q1"] = 
2 + scoring_board["Q2"] = 2 + scoring_board_stop["Q1"] = True + scoring_board_stop["Q2"] = True # noinspection PyPep8Naming @test_item @@ -786,23 +885,23 @@ def Q5(graph): if dep > head: continue - rel = graph.edge[dep][head]['rel'] + rel = graph.edge[dep][head]["rel"] - if rel != 'SUBJ': + if rel != "SUBJ": continue - head_pos = graph.node[head]['pos'] + head_pos = graph.node[head]["pos"] - if head_pos != 'V': + if head_pos != "V": continue for i in range(dep + 1, head): # head > dep - if graph.node[i]['pos'] == 'NEG': - scoring_board['Q5'] += 1 - add_one_point_if_needed('Q3') + if graph.node[i]["pos"] == "NEG": + scoring_board["Q5"] += 1 + add_one_point_if_needed("Q3") break - if turn_off_scoring_board('Q5'): + if turn_off_scoring_board("Q5"): break # noinspection PyPep8Naming @@ -815,12 +914,12 @@ def Q6(graph): return for i in range(1, graph.number_of_nodes()): - if scoring_board_stop['Q6']: + if scoring_board_stop["Q6"]: break - pos = graph.node[i]['pos'] + pos = graph.node[i]["pos"] - if pos not in {'COP', 'MOD', 'AUX'}: + if pos not in {"COP", "MOD", "AUX"}: continue for dep, head in graph.edges().items(): @@ -831,12 +930,12 @@ def Q6(graph): continue # we want "inversion" (= dep-wh comes before head-V) - pos_of_dep = graph.node[dep]['pos'] + pos_of_dep = graph.node[dep]["pos"] - if pos_of_dep == 'ADV:WH': - scoring_board['Q6'] += 1 + if pos_of_dep == "ADV:WH": + scoring_board["Q6"] += 1 - if turn_off_scoring_board('Q6'): + if turn_off_scoring_board("Q6"): break # noinspection PyPep8Naming @@ -849,22 +948,22 @@ def Q7(graph): return for i in range(1, graph.number_of_nodes()): - pos = graph.node[i]['pos'] + pos = graph.node[i]["pos"] - if pos not in {'MOD', 'COP', 'AUX'}: + if pos not in {"MOD", "COP", "AUX"}: continue for dep, head in graph.edges().items(): if head != i: continue - pos_of_dep = graph.node[dep]['pos'] + pos_of_dep = graph.node[dep]["pos"] - if pos_of_dep == 'NEG': - scoring_board['Q7'] += 1 - add_one_point_if_needed('Q5') + if pos_of_dep == "NEG": + scoring_board["Q7"] += 1 + add_one_point_if_needed("Q5") - if turn_off_scoring_board('Q7'): + if turn_off_scoring_board("Q7"): break # noinspection PyPep8Naming @@ -878,34 +977,34 @@ def Q8(graph): if not graph.number_of_nodes() > 2: return - final_word = graph.node[graph.number_of_nodes() - 1]['word'] - if final_word != '?': + final_word = graph.node[graph.number_of_nodes() - 1]["word"] + if final_word != "?": return for i in range(1, graph.number_of_nodes() - 1): - if scoring_board_stop['Q8']: + if scoring_board_stop["Q8"]: break - pos1 = graph.node[i]['pos'] + pos1 = graph.node[i]["pos"] if i != 1: - wh_test = graph.node[i - 1]['pos'] + wh_test = graph.node[i - 1]["pos"] else: - wh_test = 'dummy' + wh_test = "dummy" - if pos1 in {'COP', 'MOD', 'AUX'} and not wh_test.endswith('WH'): + if pos1 in {"COP", "MOD", "AUX"} and not wh_test.endswith("WH"): for j in graph.edge[i + 1].keys(): - rel2 = graph.edge[i + 1][j]['rel'] + rel2 = graph.edge[i + 1][j]["rel"] - if rel2 == 'SUBJ': - scoring_board['Q8'] += 1 + if rel2 == "SUBJ": + scoring_board["Q8"] += 1 - if turn_off_scoring_board('Q8'): - scoring_board['Q1'] = 2 - scoring_board['Q2'] = 2 - scoring_board_stop['Q1'] = True - scoring_board_stop['Q2'] = True + if turn_off_scoring_board("Q8"): + scoring_board["Q1"] = 2 + scoring_board["Q2"] = 2 + scoring_board_stop["Q1"] = True + scoring_board_stop["Q2"] = True break # noinspection PyPep8Naming @@ -914,13 +1013,13 @@ def Q9(graph): """ Q9: Why, when, which, whose """ - wh = {'why', 'when', 'which', 'whose'} + wh = {"why", 
"when", "which", "whose"} for i in range(1, graph.number_of_nodes()): - word = graph.node[i]['word'] + word = graph.node[i]["word"] if word in wh: - scoring_board['Q9'] += 1 + scoring_board["Q9"] += 1 - if turn_off_scoring_board('Q9'): + if turn_off_scoring_board("Q9"): break # noinspection PyPep8Naming @@ -933,31 +1032,31 @@ def Q10(graph): return # Part 1: test for ending "okay ?", "ok ?", "right ?" - final_word = graph.node[graph.number_of_nodes() - 1]['word'] - if final_word != '?': + final_word = graph.node[graph.number_of_nodes() - 1]["word"] + if final_word != "?": return - second_final_word = graph.node[graph.number_of_nodes() - 2]['word'] - if second_final_word in {'okay', 'ok', 'right'}: - scoring_board['Q10'] += 1 + second_final_word = graph.node[graph.number_of_nodes() - 2]["word"] + if second_final_word in {"okay", "ok", "right"}: + scoring_board["Q10"] += 1 - if turn_off_scoring_board('Q10'): + if turn_off_scoring_board("Q10"): return # Part 2: test for "normal" tag questions - good_pos = {'COP NEG PRO ?', 'COP PRO ?'} + good_pos = {"COP NEG PRO ?", "COP PRO ?"} collate = [] for i in range(1, graph.number_of_nodes()): - collate.append(graph.node[i]['pos']) + collate.append(graph.node[i]["pos"]) - test = ' '.join(collate) + test = " ".join(collate) for tag in good_pos: if tag in test: - scoring_board['Q10'] += 1 + scoring_board["Q10"] += 1 - if turn_off_scoring_board('Q10'): + if turn_off_scoring_board("Q10"): break # noinspection PyPep8Naming @@ -969,9 +1068,9 @@ def S1(graph): if not graph.number_of_nodes() > 2: return - scoring_board['S1'] += 1 + scoring_board["S1"] += 1 - if turn_off_scoring_board('S1'): + if turn_off_scoring_board("S1"): pass # noinspection PyPep8Naming @@ -987,18 +1086,18 @@ def S2(graph): if dep > head: continue - rel = graph.edge[dep][head]['rel'] + rel = graph.edge[dep][head]["rel"] - if rel != 'SUBJ': + if rel != "SUBJ": continue - head_pos = graph.node[head]['pos'] + head_pos = graph.node[head]["pos"] - if head_pos == 'V': - scoring_board['S2'] += 1 - add_one_point_if_needed('S1') + if head_pos == "V": + scoring_board["S2"] += 1 + add_one_point_if_needed("S1") - if turn_off_scoring_board('S2'): + if turn_off_scoring_board("S2"): break # noinspection PyPep8Naming @@ -1014,18 +1113,18 @@ def S3(graph): if dep < head: continue - rel = graph.edge[dep][head]['rel'] + rel = graph.edge[dep][head]["rel"] - if rel != 'OBJ': + if rel != "OBJ": continue - head_pos = graph.node[head]['pos'] + head_pos = graph.node[head]["pos"] - if head_pos == 'V': - scoring_board['S3'] += 1 - add_one_point_if_needed('S1') + if head_pos == "V": + scoring_board["S3"] += 1 + add_one_point_if_needed("S1") - if turn_off_scoring_board('S3'): + if turn_off_scoring_board("S3"): break # noinspection PyPep8Naming @@ -1038,9 +1137,9 @@ def S4(graph): return for i in range(1, graph.number_of_nodes()): - pos = graph.node[i]['pos'] + pos = graph.node[i]["pos"] - if pos != 'V': + if pos != "V": continue has_subject = False @@ -1050,20 +1149,24 @@ def S4(graph): if i != test_verb: continue - if (dep < test_verb - and graph.edge[dep][test_verb]['rel'] == 'SUBJ'): + if ( + dep < test_verb + and graph.edge[dep][test_verb]["rel"] == "SUBJ" + ): has_subject = True - if (dep > test_verb - and graph.edge[dep][test_verb]['rel'] == 'OBJ'): + if ( + dep > test_verb + and graph.edge[dep][test_verb]["rel"] == "OBJ" + ): has_object = True if has_subject and has_object: - scoring_board['S4'] += 1 - add_one_point_if_needed('S2') - add_one_point_if_needed('S3') + scoring_board["S4"] += 1 + 
add_one_point_if_needed("S2") + add_one_point_if_needed("S3") - if turn_off_scoring_board('S4'): + if turn_off_scoring_board("S4"): break # noinspection PyPep8Naming @@ -1073,12 +1176,12 @@ def S5(graph): S5: Conjunction (any) """ for i in range(1, graph.number_of_nodes()): - pos = graph.node[i]['pos'] + pos = graph.node[i]["pos"] - if pos == 'CONJ': - scoring_board['S5'] += 1 + if pos == "CONJ": + scoring_board["S5"] += 1 - if turn_off_scoring_board('S5'): + if turn_off_scoring_board("S5"): break # noinspection PyPep8Naming @@ -1095,18 +1198,18 @@ def S6(graph): deps_of_verbs = [] for dep, head in all_edges.items(): - head_pos = graph.node[head]['pos'] + head_pos = graph.node[head]["pos"] - if head_pos != 'V': + if head_pos != "V": continue verbs.append(head) deps_of_verbs.append(dep) if len(verbs) == 2 and tuple(verbs) not in list(all_edges.items()): - scoring_board['S6'] += 1 + scoring_board["S6"] += 1 - if turn_off_scoring_board('S6'): + if turn_off_scoring_board("S6"): pass # noinspection PyPep8Naming @@ -1121,18 +1224,20 @@ def S7(graph): # for all trios, we want the middle word to be CONJ (for pos) # and the first+final words are *not* punctuation (for mor) for i in range(1, graph.number_of_nodes() - 2): - mor1 = graph.node[i]['mor'] - pos2 = graph.node[i + 1]['pos'] - mor3 = graph.node[i + 2]['mor'] - - punctuations = {'', 'beg', 'end'} - if (pos2 == 'CONJ' - and mor1 not in punctuations - and mor3 not in punctuations): - scoring_board['S7'] += 1 - add_one_point_if_needed('S5') - - if turn_off_scoring_board('S7'): + mor1 = graph.node[i]["mor"] + pos2 = graph.node[i + 1]["pos"] + mor3 = graph.node[i + 2]["mor"] + + punctuations = {"", "beg", "end"} + if ( + pos2 == "CONJ" + and mor1 not in punctuations + and mor3 not in punctuations + ): + scoring_board["S7"] += 1 + add_one_point_if_needed("S5") + + if turn_off_scoring_board("S7"): break # noinspection PyPep8Naming @@ -1150,9 +1255,9 @@ def S8(graph): # -- infinitive "to" with a head *not* being the main verb for dep, head in graph.edges().items(): - pos = graph.node[dep]['pos'] + pos = graph.node[dep]["pos"] - if pos != 'INF': + if pos != "INF": continue inf_verb = head @@ -1161,13 +1266,13 @@ def S8(graph): if inf_verb != test_verb: continue - if not graph.edge[inf_verb][new_head]['rel'].endswith('ROOT'): - scoring_board['S8'] += 1 - add_one_point_if_needed('S6') - add_one_point_if_needed('V5') + if not graph.edge[inf_verb][new_head]["rel"].endswith("ROOT"): + scoring_board["S8"] += 1 + add_one_point_if_needed("S6") + add_one_point_if_needed("V5") break - if turn_off_scoring_board('S8'): + if turn_off_scoring_board("S8"): break # noinspection PyPep8Naming @@ -1177,14 +1282,14 @@ def S9(graph): S9: Let/Make/Help/Watch introducer (also needs a dependent verb, according to the examples) """ - targets = {'let', 'make', 'help', 'watch'} + targets = {"let", "make", "help", "watch"} all_edges = graph.edges() for dep, head in all_edges.items(): if dep != 1: continue - if graph.node[dep]['word'] not in targets: + if graph.node[dep]["word"] not in targets: continue target_head = dep @@ -1193,11 +1298,11 @@ def S9(graph): if test_head != target_head: continue - if graph.node[test_dep]['pos'] == 'V': - scoring_board['S9'] += 1 + if graph.node[test_dep]["pos"] == "V": + scoring_board["S9"] += 1 break - if turn_off_scoring_board('S9'): + if turn_off_scoring_board("S9"): break # noinspection PyPep8Naming @@ -1207,17 +1312,17 @@ def S10(graph): S10: Adverbial conjunction (Also credit: S5) (conjunction excluding "and", "or", "then" -- according to 
examples) """ - exceptions = {'and', 'or', 'then'} + exceptions = {"and", "or", "then"} for i in range(1, graph.number_of_nodes()): - word = graph.node[i]['word'] - pos = graph.node[i]['pos'] + word = graph.node[i]["word"] + pos = graph.node[i]["pos"] - if pos == 'CONJ' and word not in exceptions: - scoring_board['S10'] += 1 - add_one_point_if_needed('S5') + if pos == "CONJ" and word not in exceptions: + scoring_board["S10"] += 1 + add_one_point_if_needed("S5") - if turn_off_scoring_board('S10'): + if turn_off_scoring_board("S10"): break # noinspection PyPep8Naming @@ -1234,16 +1339,18 @@ def S11(graph): for dep, head in graph.edges().items(): subject_count_increment = False - if (graph.edge[dep][head]['rel'] == 'SUBJ' - and graph.node[dep]['word'] != CLITIC): + if ( + graph.edge[dep][head]["rel"] == "SUBJ" + and graph.node[dep]["word"] != CLITIC + ): subject_count += 1 subject_count_increment = True if subject_count_increment and subject_count > 1: - scoring_board['S11'] += 1 - add_one_point_if_needed('S6') + scoring_board["S11"] += 1 + add_one_point_if_needed("S6") - if turn_off_scoring_board('S11'): + if turn_off_scoring_board("S11"): break # noinspection PyPep8Naming @@ -1257,19 +1364,19 @@ def S12(graph): return for dep, head in graph.edges().items(): - dep_word = graph.node[dep]['word'] + dep_word = graph.node[dep]["word"] - if dep_word != 'and': + if dep_word != "and": continue - rel = graph.edge[dep][head]['rel'] + rel = graph.edge[dep][head]["rel"] - if rel == 'CONJ' and graph.node[head]['pos'] == 'V': - scoring_board['S12'] += 1 - add_one_point_if_needed('S6') - add_one_point_if_needed('S5') + if rel == "CONJ" and graph.node[head]["pos"] == "V": + scoring_board["S12"] += 1 + add_one_point_if_needed("S6") + add_one_point_if_needed("S5") - if turn_off_scoring_board('S12'): + if turn_off_scoring_board("S12"): break # noinspection PyPep8Naming @@ -1282,32 +1389,34 @@ def S13(graph): return for dep, head in graph.edges().items(): - dep_pos = graph.node[dep]['pos'] - if not dep_pos.endswith('WH'): + dep_pos = graph.node[dep]["pos"] + if not dep_pos.endswith("WH"): continue inf = False - if (dep + 1 in graph.nodes() - and graph.node[dep + 1]['word'] == 'INF'): + if ( + dep + 1 in graph.nodes() + and graph.node[dep + 1]["word"] == "INF" + ): inf = True # we want the head of wh-word to NOT have ROOT as rel # (= ban a wh question) - rel = '' + rel = "" for i in graph.edge[head].keys(): - rel = graph.edge[head][i]['rel'] + rel = graph.edge[head][i]["rel"] break - if rel != 'ROOT': - scoring_board['S13'] += 1 - add_one_point_if_needed('S6') + if rel != "ROOT": + scoring_board["S13"] += 1 + add_one_point_if_needed("S6") if inf: - add_one_point_if_needed('S8') - add_one_point_if_needed('S17') + add_one_point_if_needed("S8") + add_one_point_if_needed("S17") - if turn_off_scoring_board('S13'): + if turn_off_scoring_board("S13"): break # noinspection PyPep8Naming @@ -1322,9 +1431,9 @@ def S14(graph): dep_head_pairs_for_obj = [] for dep, head in graph.edges().items(): - rel = graph.edge[dep][head]['rel'] + rel = graph.edge[dep][head]["rel"] - if rel != 'OBJ': + if rel != "OBJ": continue dep_head_pairs_for_obj.append((dep, head)) @@ -1332,10 +1441,10 @@ def S14(graph): heads = [head for _, head in dep_head_pairs_for_obj] if len(set(heads)) < len(dep_head_pairs_for_obj): - scoring_board['S14'] += 1 - add_one_point_if_needed('S3') + scoring_board["S14"] += 1 + add_one_point_if_needed("S3") - if turn_off_scoring_board('S14'): + if turn_off_scoring_board("S14"): pass # noinspection PyPep8Naming @@ 
-1347,14 +1456,19 @@ def S15(graph): if not graph.number_of_nodes() > 3: return - number_of_verbs = sum([1 for i in range(1, graph.number_of_nodes()) - if graph.node[i]['pos'] == 'V']) + number_of_verbs = sum( + [ + 1 + for i in range(1, graph.number_of_nodes()) + if graph.node[i]["pos"] == "V" + ] + ) if number_of_verbs > 2: - scoring_board['S15'] += 1 - add_one_point_if_needed('S6') + scoring_board["S15"] += 1 + add_one_point_if_needed("S6") - if turn_off_scoring_board('S15'): + if turn_off_scoring_board("S15"): pass # noinspection PyPep8Naming @@ -1374,20 +1488,20 @@ def S16(graph): if dep < head: continue - if graph.edge[dep][head]['rel'] != 'CMOD': + if graph.edge[dep][head]["rel"] != "CMOD": continue and_ = False for i in range(head + 1, dep): # dep > head - if graph.node[i]['word'] == 'and': + if graph.node[i]["word"] == "and": and_ = True break if not and_: - scoring_board['S16'] += 1 - add_one_point_if_needed('S6') + scoring_board["S16"] += 1 + add_one_point_if_needed("S6") - if turn_off_scoring_board('S16'): + if turn_off_scoring_board("S16"): break # noinspection PyPep8Naming @@ -1403,13 +1517,13 @@ def S17(graph): # ("me" is the new subject for the infinitive clause) for dep, head in graph.edges().items(): - word = graph.node[dep]['word'] - pos = graph.node[dep]['pos'] + word = graph.node[dep]["word"] + pos = graph.node[dep]["pos"] - if word != 'to' or pos != 'INF': + if word != "to" or pos != "INF": continue - inf_verb = head # "go" in the example + inf_verb = head # "go" in the example main_verb = graph.edges()[inf_verb] # "wants" # check if there's an object of "wants" @@ -1417,12 +1531,12 @@ def S17(graph): if test_main_verb != main_verb: continue - if graph.edge[test_obj][test_main_verb]['rel'] == 'OBJ': - scoring_board['S17'] += 1 - add_one_point_if_needed('S8') + if graph.edge[test_obj][test_main_verb]["rel"] == "OBJ": + scoring_board["S17"] += 1 + add_one_point_if_needed("S8") break - if turn_off_scoring_board('S17'): + if turn_off_scoring_board("S17"): break # noinspection PyPep8Naming @@ -1432,13 +1546,13 @@ def S18(graph): S18: Gerund (Also credit: V7) """ for i in range(1, graph.number_of_nodes()): - pos = graph.node[i]['pos'] + pos = graph.node[i]["pos"] - if pos == 'N:GERUND': - scoring_board['S18'] += 1 - add_one_point_if_needed('V7') + if pos == "N:GERUND": + scoring_board["S18"] += 1 + add_one_point_if_needed("V7") - if turn_off_scoring_board('S18'): + if turn_off_scoring_board("S18"): break # noinspection PyPep8Naming @@ -1454,23 +1568,23 @@ def S19(graph): subj_position_list = [] for dep, head in graph.edges().items(): - pos = graph.node[dep]['pos'] - rel = graph.edge[dep][head]['rel'] + pos = graph.node[dep]["pos"] + rel = graph.edge[dep][head]["rel"] - if pos == 'CONJ' and dep < conj_position: + if pos == "CONJ" and dep < conj_position: conj_position = dep - if rel == 'SUBJ': + if rel == "SUBJ": subj_position_list.append(dep) if len(subj_position_list) < 2: return if conj_position < min(subj_position_list): - scoring_board['S19'] += 1 - add_one_point_if_needed('S6') + scoring_board["S19"] += 1 + add_one_point_if_needed("S6") - if turn_off_scoring_board('S19'): + if turn_off_scoring_board("S19"): pass return sum(scoring_board.values()) diff --git a/pylangacq/tests/test_chat.py b/pylangacq/tests/test_chat.py index 4e4bbfc..f390b57 100644 --- a/pylangacq/tests/test_chat.py +++ b/pylangacq/tests/test_chat.py @@ -5,9 +5,6 @@ (and fix the bugs, if any). 
""" -from __future__ import print_function - -import sys import os import zipfile import tempfile @@ -18,19 +15,15 @@ from pylangacq import read_chat, Reader -if sys.version_info[0] == 2: # pragma: no coverage - from io import open - - _THIS_DIR = os.path.dirname(__file__) -REMOTE_BROWN_URL = 'https://childes.talkbank.org/data/Eng-NA/Brown.zip' -REMOTE_BROWN_ZIP_PATH = 'brown.zip' -REMOTE_EVE_DIR = os.path.abspath(os.path.join('Brown', 'Eve')) -REMOTE_EVE_FILE_PATH_1 = os.path.join(REMOTE_EVE_DIR, '010600a.cha') -REMOTE_EVE_FILE_PATH_2 = os.path.join(REMOTE_EVE_DIR, '010600b.cha') -REMOTE_EVE_FILE_PATH_ALL_FILES = os.path.join(REMOTE_EVE_DIR, '*.cha') -LOCAL_EVE_PATH = os.path.join(_THIS_DIR, 'test_data', 'eve.cha') +REMOTE_BROWN_URL = "https://childes.talkbank.org/data/Eng-NA/Brown.zip" +REMOTE_BROWN_ZIP_PATH = "brown.zip" +REMOTE_EVE_DIR = os.path.abspath(os.path.join("Brown", "Eve")) +REMOTE_EVE_FILE_PATH_1 = os.path.join(REMOTE_EVE_DIR, "010600a.cha") +REMOTE_EVE_FILE_PATH_2 = os.path.join(REMOTE_EVE_DIR, "010600b.cha") +REMOTE_EVE_FILE_PATH_ALL_FILES = os.path.join(REMOTE_EVE_DIR, "*.cha") +LOCAL_EVE_PATH = os.path.join(_THIS_DIR, "test_data", "eve.cha") def almost_equal(x, y, tolerance): @@ -39,22 +32,26 @@ def almost_equal(x, y, tolerance): return abs(x - y) <= tolerance -@pytest.mark.skipif('TRAVIS' not in os.environ, - reason='assuming Brown/ available, speed up local dev ' - 'for running tests without download') +@pytest.mark.skipif( + "CI" not in os.environ, + reason="assuming Brown/ available, speed up local dev " + "for running tests without download", +) def test_download_and_extract_brown_zip_file(): # pragma: no cover """pytest runs tests in the same order they are defined in the test module, and so this test for downloading and unzipping the Brown zip data file runs first. If download fails, abort all tests.""" try: - with open(REMOTE_BROWN_ZIP_PATH, 'wb') as f: + with open(REMOTE_BROWN_ZIP_PATH, "wb") as f: with requests.get(REMOTE_BROWN_URL) as r: f.write(r.content) except Exception as e: - msg = ('Error in downloading {}: ' - 'network problems or invalid URL for Brown zip? ' - 'If URL needs updating, tutorial.rst in docs ' - 'has to be updated as well.'.format(REMOTE_BROWN_URL)) + msg = ( + "Error in downloading {}: " + "network problems or invalid URL for Brown zip? 
" + "If URL needs updating, tutorial.rst in docs " + "has to be updated as well.".format(REMOTE_BROWN_URL) + ) try: raise e finally: @@ -69,45 +66,56 @@ def test_download_and_extract_brown_zip_file(): # pragma: no cover @pytest.fixture def eve_one_file(): - return read_chat(LOCAL_EVE_PATH, encoding='utf-8') + return read_chat(LOCAL_EVE_PATH, encoding="utf-8") @pytest.fixture def eve_all_files(): - return read_chat(REMOTE_EVE_FILE_PATH_ALL_FILES, encoding='utf-8') - - -@pytest.mark.parametrize('classmethod,arg', [ - pytest.param(Reader.from_chat_str, - open(LOCAL_EVE_PATH, encoding='utf-8').read(), - id='from_chat_str'), - pytest.param(Reader.from_chat_files, - LOCAL_EVE_PATH, - id='from_chat_files') -]) + return read_chat(REMOTE_EVE_FILE_PATH_ALL_FILES, encoding="utf-8") + + +@pytest.mark.parametrize( + "classmethod,arg", + [ + pytest.param( + Reader.from_chat_str, + open(LOCAL_EVE_PATH, encoding="utf-8").read(), + id="from_chat_str", + ), + pytest.param( + Reader.from_chat_files, LOCAL_EVE_PATH, id="from_chat_files" + ), + ], +) def test_instantiate_reader(classmethod, arg): """`read_chat` and the from_x classmethods works the same.""" - reader_from_classmethod = classmethod(arg, encoding='utf-8') - reader_from_read_chat = read_chat(REMOTE_EVE_FILE_PATH_1, encoding='utf-8') + from_classmethod = classmethod(arg, encoding="utf-8") + from_read_chat = read_chat(REMOTE_EVE_FILE_PATH_1, encoding="utf-8") # "header" and "index_to_tiers" combined cover the entire data file - header_from_classmethod = list(reader_from_classmethod.headers().values())[0] # noqa - header_from_read_chat = list(reader_from_read_chat.headers().values())[0] + header_from_classmethod = list(from_classmethod.headers().values())[0] + header_from_read_chat = list(from_read_chat.headers().values())[0] - index_to_tiers_from_classmethod = list(reader_from_classmethod.index_to_tiers().values())[0] # noqa - index_to_tiers_from_read_chat = list(reader_from_read_chat.index_to_tiers().values())[0] # noqa + index_to_tiers_from_classmethod = list( + from_classmethod.index_to_tiers().values() + )[0] + index_to_tiers_from_read_chat = list( + from_read_chat.index_to_tiers().values() + )[0] assert header_from_classmethod == header_from_read_chat - assert len(index_to_tiers_from_classmethod) == len(index_to_tiers_from_read_chat) # noqa + assert len(index_to_tiers_from_classmethod) == len( + index_to_tiers_from_read_chat + ) for (i_c, tier_c), (i_r, tier_r) in zip( - sorted(index_to_tiers_from_classmethod.items()), - sorted(index_to_tiers_from_read_chat.items()) + sorted(index_to_tiers_from_classmethod.items()), + sorted(index_to_tiers_from_read_chat.items()), ): try: assert tier_c == tier_r except AssertionError: - print('i_c:', i_c, 'i_r:', i_r) + print("i_c:", i_c, "i_r:", i_r) raise @@ -144,13 +152,13 @@ def test_add(eve_one_file): # Add a non-existing file => should throw an error with pytest.raises(ValueError): - eve_one_file.add('foo') + eve_one_file.add("foo") def test_remove(eve_one_file): # Remove a non-existing file => should throw an error with pytest.raises(ValueError): - eve_one_file.remove('foo') + eve_one_file.remove("foo") # Remove an existing file NOT in reader => should throw an error with tempfile.NamedTemporaryFile() as dummy_file: @@ -172,24 +180,25 @@ def test_clear(eve_one_file): def test_filenames(eve_all_files): - expected_filenames = [os.path.abspath(os.path.join(REMOTE_EVE_DIR, x)) - for x in sorted(os.listdir(REMOTE_EVE_DIR))] + expected_filenames = [ + os.path.abspath(os.path.join(REMOTE_EVE_DIR, x)) + for x in 
sorted(os.listdir(REMOTE_EVE_DIR)) + ] assert eve_all_files.filenames() == set(expected_filenames) assert eve_all_files.filenames(sorted_by_age=True) == expected_filenames def test_number_of_utterances(eve_one_file): - assert almost_equal(eve_one_file.number_of_utterances(), 1601, - tolerance=3) + assert almost_equal(eve_one_file.number_of_utterances(), 1601, tolerance=3) assert almost_equal( eve_one_file.number_of_utterances(by_files=True)[LOCAL_EVE_PATH], 1601, - tolerance=3 + tolerance=3, ) def test_participant_codes(eve_one_file): - expected_codes = {'CHI', 'MOT', 'COL', 'RIC'} + expected_codes = {"CHI", "MOT", "COL", "RIC"} assert eve_one_file.participant_codes() == expected_codes assert eve_one_file.participant_codes(by_files=True) == { LOCAL_EVE_PATH: expected_codes @@ -197,12 +206,13 @@ def test_participant_codes(eve_one_file): def test_languages(eve_one_file): - assert eve_one_file.languages() == {LOCAL_EVE_PATH: ['eng']} + assert eve_one_file.languages() == {LOCAL_EVE_PATH: ["eng"]} def test_dates_of_recording(eve_one_file): assert eve_one_file.dates_of_recording() == { - LOCAL_EVE_PATH: [(1962, 10, 15), (1962, 10, 17)]} + LOCAL_EVE_PATH: [(1962, 10, 15), (1962, 10, 17)] + } def test_age(eve_one_file): @@ -213,21 +223,26 @@ def test_age(eve_one_file): def test_words(eve_one_file): words = eve_one_file.words() assert almost_equal(len(words), 5843, tolerance=3) - assert words[:5] == ['more', 'cookie', '.', 'you', '0v'] + assert words[:5] == ["more", "cookie", ".", "you", "0v"] def test_tagged_words(eve_one_file): - tagged_words = eve_one_file.tagged_words(participant='MOT') + tagged_words = eve_one_file.tagged_words(participant="MOT") assert tagged_words[:2] == [ - ('you', 'PRO:PER', 'you', (1, 2, 'SUBJ')), - ('0v', '0V', 'v', (2, 0, 'ROOT')), + ("you", "PRO:PER", "you", (1, 2, "SUBJ")), + ("0v", "0V", "v", (2, 0, "ROOT")), ] def test_word_frequency(eve_all_files): word_freq = eve_all_files.word_frequency() - expected_top_five = [('.', 20130), ('?', 6358), ('you', 3695), - ('the', 2524), ('it', 2365)] + expected_top_five = [ + (".", 20130), + ("?", 6358), + ("you", 3695), + ("the", 2524), + ("it", 2365), + ] for expected, actual in zip(expected_top_five, word_freq.most_common(5)): expected_word, expected_freq = expected actual_word, actual_freq = actual @@ -237,9 +252,13 @@ def test_word_frequency(eve_all_files): def test_word_ngrams(eve_all_files): bigrams = eve_all_files.word_ngrams(2) - expected_top_five = [(('it', '.'), 705), (('that', '?'), 619), - (('what', '?'), 560), (('yeah', '.'), 510), - (('there', '.'), 471)] + expected_top_five = [ + (("it", "."), 705), + (("that", "?"), 619), + (("what", "?"), 560), + (("yeah", "."), 510), + (("there", "."), 471), + ] for expected, actual in zip(expected_top_five, bigrams.most_common(5)): expected_bigram, expected_freq = expected actual_bigram, actual_freq = actual @@ -249,132 +268,156 @@ def test_word_ngrams(eve_all_files): def test_participants(eve_one_file): assert eve_one_file.participants()[LOCAL_EVE_PATH] == { - 'CHI': {'SES': '', - 'age': '1;06.00', - 'corpus': 'Brown', - 'custom': '', - 'education': '', - 'group': '', - 'language': 'eng', - 'participant_name': 'Eve', - 'participant_role': 'Target_Child', - 'sex': 'female'}, - 'COL': {'SES': '', - 'age': '', - 'corpus': 'Brown', - 'custom': '', - 'education': '', - 'group': '', - 'language': 'eng', - 'participant_name': 'Colin', - 'participant_role': 'Investigator', - 'sex': ''}, - 'MOT': {'SES': '', - 'age': '', - 'corpus': 'Brown', - 'custom': '', - 'education': '', - 
'group': '', - 'language': 'eng', - 'participant_name': 'Sue', - 'participant_role': 'Mother', - 'sex': 'female'}, - 'RIC': {'SES': '', - 'age': '', - 'corpus': 'Brown', - 'custom': '', - 'education': '', - 'group': '', - 'language': 'eng', - 'participant_name': 'Richard', - 'participant_role': 'Investigator', - 'sex': ''} + "CHI": { + "SES": "", + "age": "1;06.00", + "corpus": "Brown", + "custom": "", + "education": "", + "group": "", + "language": "eng", + "participant_name": "Eve", + "participant_role": "Target_Child", + "sex": "female", + }, + "COL": { + "SES": "", + "age": "", + "corpus": "Brown", + "custom": "", + "education": "", + "group": "", + "language": "eng", + "participant_name": "Colin", + "participant_role": "Investigator", + "sex": "", + }, + "MOT": { + "SES": "", + "age": "", + "corpus": "Brown", + "custom": "", + "education": "", + "group": "", + "language": "eng", + "participant_name": "Sue", + "participant_role": "Mother", + "sex": "female", + }, + "RIC": { + "SES": "", + "age": "", + "corpus": "Brown", + "custom": "", + "education": "", + "group": "", + "language": "eng", + "participant_name": "Richard", + "participant_role": "Investigator", + "sex": "", + }, } def test_headers(eve_one_file): assert eve_one_file.headers()[LOCAL_EVE_PATH] == { - 'Date': ['15-OCT-1962', '17-OCT-1962'], - 'Languages': 'eng', - 'PID': '11312/c-00034743-1', - 'Participants': {'CHI': {'SES': '', - 'age': '1;06.00', - 'corpus': 'Brown', - 'custom': '', - 'education': '', - 'group': '', - 'language': 'eng', - 'participant_name': 'Eve', - 'participant_role': 'Target_Child', - 'sex': 'female'}, - 'COL': {'SES': '', - 'age': '', - 'corpus': 'Brown', - 'custom': '', - 'education': '', - 'group': '', - 'language': 'eng', - 'participant_name': 'Colin', - 'participant_role': 'Investigator', - 'sex': ''}, - 'MOT': {'SES': '', - 'age': '', - 'corpus': 'Brown', - 'custom': '', - 'education': '', - 'group': '', - 'language': 'eng', - 'participant_name': 'Sue', - 'participant_role': 'Mother', - 'sex': 'female'}, - 'RIC': {'SES': '', - 'age': '', - 'corpus': 'Brown', - 'custom': '', - 'education': '', - 'group': '', - 'language': 'eng', - 'participant_name': 'Richard', - 'participant_role': 'Investigator', - 'sex': ''}}, - 'Tape Location': '850', - 'Time Duration': '11:30-12:00', - 'UTF8': '' + "Date": ["15-OCT-1962", "17-OCT-1962"], + "Languages": "eng", + "PID": "11312/c-00034743-1", + "Participants": { + "CHI": { + "SES": "", + "age": "1;06.00", + "corpus": "Brown", + "custom": "", + "education": "", + "group": "", + "language": "eng", + "participant_name": "Eve", + "participant_role": "Target_Child", + "sex": "female", + }, + "COL": { + "SES": "", + "age": "", + "corpus": "Brown", + "custom": "", + "education": "", + "group": "", + "language": "eng", + "participant_name": "Colin", + "participant_role": "Investigator", + "sex": "", + }, + "MOT": { + "SES": "", + "age": "", + "corpus": "Brown", + "custom": "", + "education": "", + "group": "", + "language": "eng", + "participant_name": "Sue", + "participant_role": "Mother", + "sex": "female", + }, + "RIC": { + "SES": "", + "age": "", + "corpus": "Brown", + "custom": "", + "education": "", + "group": "", + "language": "eng", + "participant_name": "Richard", + "participant_role": "Investigator", + "sex": "", + }, + }, + "Tape Location": "850", + "Time Duration": "11:30-12:00", + "UTF8": "", } def test_sents(eve_one_file): assert eve_one_file.sents()[:2] == [ - ['more', 'cookie', '.'], ['you', '0v', 'more', 'cookies', '?'] + ["more", "cookie", "."], + 
["you", "0v", "more", "cookies", "?"], ] def test_tagged_sents(eve_one_file): assert eve_one_file.tagged_sents()[:2] == [ - [('more', 'QN', 'more', (1, 2, 'QUANT')), - ('cookie', 'N', 'cookie', (2, 0, 'INCROOT')), - ('.', '.', '', (3, 2, 'PUNCT'))], - [('you', 'PRO:PER', 'you', (1, 2, 'SUBJ')), - ('0v', '0V', 'v', (2, 0, 'ROOT')), - ('more', 'QN', 'more', (3, 4, 'QUANT')), - ('cookies', 'N', 'cookie-PL', (4, 2, 'OBJ')), - ('?', '?', '', (5, 2, 'PUNCT'))] + [ + ("more", "QN", "more", (1, 2, "QUANT")), + ("cookie", "N", "cookie", (2, 0, "INCROOT")), + (".", ".", "", (3, 2, "PUNCT")), + ], + [ + ("you", "PRO:PER", "you", (1, 2, "SUBJ")), + ("0v", "0V", "v", (2, 0, "ROOT")), + ("more", "QN", "more", (3, 4, "QUANT")), + ("cookies", "N", "cookie-PL", (4, 2, "OBJ")), + ("?", "?", "", (5, 2, "PUNCT")), + ], ] def test_utterances(eve_one_file): assert eve_one_file.utterances()[:5] == [ - ('CHI', 'more cookie .'), - ('MOT', 'you 0v more cookies ?'), - ('MOT', 'how_about another graham+cracker ?'), - ('MOT', 'would that do just as_well ?'), - ('MOT', 'here .') + ("CHI", "more cookie ."), + ("MOT", "you 0v more cookies ?"), + ("MOT", "how_about another graham+cracker ?"), + ("MOT", "would that do just as_well ?"), + ("MOT", "here ."), ] def test_part_of_speech_tags(eve_all_files): - assert almost_equal(len(eve_all_files.part_of_speech_tags()), 62, - tolerance=2) + assert almost_equal( + len(eve_all_files.part_of_speech_tags()), 62, tolerance=2 + ) def test_mlu_m(eve_one_file): diff --git a/pylangacq/tests/test_dependency.py b/pylangacq/tests/test_dependency.py index 2e45b4b..60aa178 100644 --- a/pylangacq/tests/test_dependency.py +++ b/pylangacq/tests/test_dependency.py @@ -1,25 +1,25 @@ -# -*- coding: utf-8 -*- - from pylangacq.dependency import DependencyGraph _CHAT_GRAPH_DATA = [ - ('but', 'CONJ', 'but', (1, 3, 'LINK')), - ('I', 'PRO:SUB', 'I', (2, 3, 'SUBJ')), - ('thought', 'V', 'think&PAST', (3, 0, 'ROOT')), - ('you', 'PRO', 'you', (4, 3, 'OBJ')), - ('wanted', 'V', 'want-PAST', (5, 3, 'JCT')), - ('me', 'PRO:OBJ', 'me', (6, 5, 'POBJ')), - ('to', 'INF', 'to', (7, 8, 'INF')), - ('turn', 'V', 'turn', (8, 3, 'XCOMP')), - ('it', 'PRO', 'it', (9, 8, 'OBJ')), - ('.', '.', '', (10, 3, 'PUNCT')), + ("but", "CONJ", "but", (1, 3, "LINK")), + ("I", "PRO:SUB", "I", (2, 3, "SUBJ")), + ("thought", "V", "think&PAST", (3, 0, "ROOT")), + ("you", "PRO", "you", (4, 3, "OBJ")), + ("wanted", "V", "want-PAST", (5, 3, "JCT")), + ("me", "PRO:OBJ", "me", (6, 5, "POBJ")), + ("to", "INF", "to", (7, 8, "INF")), + ("turn", "V", "turn", (8, 3, "XCOMP")), + ("it", "PRO", "it", (9, 8, "OBJ")), + (".", ".", "", (10, 3, "PUNCT")), ] def test_dep_graph_to_tikz(): graph = DependencyGraph(_CHAT_GRAPH_DATA) - assert graph.to_tikz() == """ + assert ( + graph.to_tikz() + == """ \\begin{dependency}[theme = simple] \\begin{deptext}[column sep=1em] but \\& I \\& thought \\& you \\& wanted \\& me \\& to \\& turn \\& it \\& . \\\\ @@ -37,11 +37,14 @@ def test_dep_graph_to_tikz(): \\depedge{10}{3}{PUNCT} \\end{dependency} """.strip() # noqa + ) def test_dep_graph_to_conll(): graph = DependencyGraph(_CHAT_GRAPH_DATA) - assert graph.to_conll() == """ + assert ( + graph.to_conll() + == """ but CONJ 3 LINK I PRO:SUB 3 SUBJ thought V 0 ROOT @@ -53,3 +56,4 @@ def test_dep_graph_to_conll(): it PRO 8 OBJ . . 
3 PUNCT """.strip() + ) diff --git a/pylangacq/tests/test_util.py b/pylangacq/tests/test_util.py index a458ea1..2300f90 100644 --- a/pylangacq/tests/test_util.py +++ b/pylangacq/tests/test_util.py @@ -1,92 +1,103 @@ -# -*- coding: utf-8 -*- - -from __future__ import unicode_literals - import pytest -from pylangacq.util import (clean_utterance, get_participant_code, clean_word, - convert_date_to_tuple, get_lemma_from_mor, - remove_extra_spaces, find_indices) - - -@pytest.mark.parametrize('original, expected', [ - ('[= foo ] bar', 'bar'), - ('[x 2] bar', 'bar'), - ('[+ foo ] bar', 'bar'), - ('[* foo ] bar', 'bar'), - ('[=? foo ] bar', 'bar'), - ('[=! foo ] bar', 'bar'), - ('[% foo ] bar', 'bar'), - ('[- foo ] bar', 'bar'), - ('[^ foo ] bar', 'bar'), - ('[<1] bar', 'bar'), - ('[<] bar', 'bar'), - ('[>1] bar', 'bar'), - ('[>] bar', 'bar'), - ('[<1] bar', 'bar'), - ('(1) bar', 'bar'), - ('(1.) bar', 'bar'), - ('(1.3) bar', 'bar'), - ('(1.34) bar', 'bar'), - ('(12.34) bar', 'bar'), - ('[%act: foo] bar', 'bar'), - ('[?] bar', 'bar'), - ('[!] bar', 'bar'), - ('‹ bar', 'bar'), - ('› bar', 'bar'), - ('bar', 'bar'), - ('[*] bar', 'bar'), - ('bar [*]', 'bar'), - ('“bar”', 'bar'), -]) +from pylangacq.util import ( + clean_utterance, + get_participant_code, + clean_word, + convert_date_to_tuple, + get_lemma_from_mor, + remove_extra_spaces, + find_indices, +) + + +@pytest.mark.parametrize( + "original, expected", + [ + ("[= foo ] bar", "bar"), + ("[x 2] bar", "bar"), + ("[+ foo ] bar", "bar"), + ("[* foo ] bar", "bar"), + ("[=? foo ] bar", "bar"), + ("[=! foo ] bar", "bar"), + ("[% foo ] bar", "bar"), + ("[- foo ] bar", "bar"), + ("[^ foo ] bar", "bar"), + ("[<1] bar", "bar"), + ("[<] bar", "bar"), + ("[>1] bar", "bar"), + ("[>] bar", "bar"), + ("[<1] bar", "bar"), + ("(1) bar", "bar"), + ("(1.) bar", "bar"), + ("(1.3) bar", "bar"), + ("(1.34) bar", "bar"), + ("(12.34) bar", "bar"), + ("[%act: foo] bar", "bar"), + ("[?] bar", "bar"), + ("[!] 
bar", "bar"), + ("‹ bar", "bar"), + ("› bar", "bar"), + ("bar", "bar"), + ("[*] bar", "bar"), + ("bar [*]", "bar"), + ("“bar”", "bar"), + ], +) def test_clean_utterance(original, expected): # TODO: Steps 3 and 5 in the function not tested assert clean_utterance(original) == expected -@pytest.mark.parametrize('keys, expected', [ - ({'CHI', '%mor', '%gra'}, 'CHI'), - ({'MOT', '%mor', '%gra'}, 'MOT'), -]) +@pytest.mark.parametrize( + "keys, expected", + [({"CHI", "%mor", "%gra"}, "CHI"), ({"MOT", "%mor", "%gra"}, "MOT")], +) def test_get_participant_code(keys, expected): assert get_participant_code(keys) == expected -@pytest.mark.parametrize('original, expected', [ - ('foo', 'foo'), - ('&foo', 'foo'), - ('foo@bar', 'foo'), - ('foo(', 'foo'), - ('foo)', 'foo'), - ('foo:', 'foo'), - ('foo;', 'foo'), - ('foo+', 'foo'), -]) +@pytest.mark.parametrize( + "original, expected", + [ + ("foo", "foo"), + ("&foo", "foo"), + ("foo@bar", "foo"), + ("foo(", "foo"), + ("foo)", "foo"), + ("foo:", "foo"), + ("foo;", "foo"), + ("foo+", "foo"), + ], +) def test_clean_word(original, expected): assert clean_word(original) == expected def test_convert_date_to_tuple(): - assert convert_date_to_tuple('01-FEB-2016') == (2016, 2, 1) + assert convert_date_to_tuple("01-FEB-2016") == (2016, 2, 1) def test_get_lemma_from_mor(): - assert get_lemma_from_mor('foo&bar-baz') == 'foo' + assert get_lemma_from_mor("foo&bar-baz") == "foo" -@pytest.mark.parametrize('original, expected', [ - ('foo bar', 'foo bar'), - ('foo bar baz', 'foo bar baz'), -]) +@pytest.mark.parametrize( + "original, expected", + [("foo bar", "foo bar"), ("foo bar baz", "foo bar baz")], +) def test_remove_extra_spaces(original, expected): assert remove_extra_spaces(original) == expected -@pytest.mark.parametrize('original, target, expected', [ - ('foo bar', 'foo', [0]), - ('foo foo bar', 'foo', [0, 4]), - ('foo bar foo', 'foo', [0, 8]), - ('foo bar baz', 'bar', [4]), -]) +@pytest.mark.parametrize( + "original, target, expected", + [ + ("foo bar", "foo", [0]), + ("foo foo bar", "foo", [0, 4]), + ("foo bar foo", "foo", [0, 8]), + ("foo bar baz", "bar", [4]), + ], +) def test_find_indices(original, target, expected): assert find_indices(original, target) == expected diff --git a/pylangacq/tests/test_version.py b/pylangacq/tests/test_version.py index 588f85f..df8d6ee 100644 --- a/pylangacq/tests/test_version.py +++ b/pylangacq/tests/test_version.py @@ -10,17 +10,14 @@ def test_version_number_match_with_changelog(): """__version__ and CHANGELOG.md match for the latest version number.""" - with open(os.path.join(_REPO_DIR, 'CHANGELOG.md')) as f: + with open(os.path.join(_REPO_DIR, "CHANGELOG.md")) as f: changelog = f.read() # latest version number in changelog = the 1st occurrence of '[x.y.z]' changelog_version = ( - re.search(r'\[\d+\.\d+\.\d+\]', changelog).group().strip('[]')) + re.search(r"\[\d+\.\d+\.\d+\]", changelog).group().strip("[]") + ) package_version = pylangacq.__version__ assert package_version == changelog_version, ( - 'Make sure both __version__ ({package_version}) and CHANGELOG ' - '({changelog_version}) are updated to match the latest version number' - .format( - package_version=package_version, - changelog_version=changelog_version - ) + f"Make sure both __version__ ({package_version}) and CHANGELOG " + f"({changelog_version}) are updated to match the latest version number" ) diff --git a/pylangacq/util.py b/pylangacq/util.py index dab983f..5a93373 100644 --- a/pylangacq/util.py +++ b/pylangacq/util.py @@ -1,13 +1,9 @@ -# -*- coding: utf-8 -*- - 
-from __future__ import unicode_literals
-
 import re
 
-CLITIC = 'CLITIC'
-ENCODING = 'utf8'
-TIMER_MARKER_REG = re.compile(r'\x15-?(\d+)_(\d+)-?\x15')
+CLITIC = "CLITIC"
+ENCODING = "utf8"
+TIMER_MARKER_REG = re.compile(r"\x15-?(\d+)_(\d+)-?\x15")
 
 
 def clean_utterance(utterance, phon=False):
@@ -57,28 +53,28 @@ def clean_utterance(utterance, phon=False):
 
     # print('utterance:', utterance, type(utterance))
 
-    utterance = re.sub(r'\[= [^\[]+?\]', '', utterance)
-    utterance = re.sub(r'\[x \d+?\]', '', utterance)
-    utterance = re.sub(r'\[\+ [^\[]+?\]', '', utterance)
-    utterance = re.sub(r'\[\* [^\[]+?\]', '', utterance)
-    utterance = re.sub(r'\[=\? [^\[]+?\]', '', utterance)
-    utterance = re.sub(r'\[=! [^\[]+?\]', '', utterance)
-    utterance = re.sub(r'\[% [^\[]+?\]', '', utterance)
-    utterance = re.sub(r'\[- [^\[]+?\]', '', utterance)
-    utterance = re.sub(r'\[\^ [^\[]+?\]', '', utterance)
-    utterance = re.sub(r'\x15[^\x15]+?\x15', '', utterance)  # TODO: Why need this?!
-    utterance = re.sub(r'\[<\d?\]', '', utterance)
-    utterance = re.sub(r'\[>\d?\]', '', utterance)
-    utterance = re.sub(r'\(\d+?\.?\d*?\)', '', utterance)
-    utterance = re.sub(r'\[%act: [^\[]+?\]', '', utterance)
-
-    utterance = re.sub(r'\[\?\]', '', utterance)
-    utterance = re.sub(r'\[\!\]', '', utterance)
-    utterance = re.sub(r'‹', '', utterance)
-    utterance = re.sub(r'›', '', utterance)
-
-    utterance = re.sub(r'\[\*\] \[/', '[/', utterance)
-    utterance = re.sub(r'\] \[\*\]', ']', utterance)
+    utterance = re.sub(r"\[= [^\[]+?\]", "", utterance)
+    utterance = re.sub(r"\[x \d+?\]", "", utterance)
+    utterance = re.sub(r"\[\+ [^\[]+?\]", "", utterance)
+    utterance = re.sub(r"\[\* [^\[]+?\]", "", utterance)
+    utterance = re.sub(r"\[=\? [^\[]+?\]", "", utterance)
+    utterance = re.sub(r"\[=! [^\[]+?\]", "", utterance)
+    utterance = re.sub(r"\[% [^\[]+?\]", "", utterance)
+    utterance = re.sub(r"\[- [^\[]+?\]", "", utterance)
+    utterance = re.sub(r"\[\^ [^\[]+?\]", "", utterance)
+    utterance = re.sub(r"\x15[^\x15]+?\x15", "", utterance)  # time markers
+    utterance = re.sub(r"\[<\d?\]", "", utterance)
+    utterance = re.sub(r"\[>\d?\]", "", utterance)
+    utterance = re.sub(r"\(\d+?\.?\d*?\)", "", utterance)
+    utterance = re.sub(r"\[%act: [^\[]+?\]", "", utterance)
+
+    utterance = re.sub(r"\[\?\]", "", utterance)
+    utterance = re.sub(r"\[\!\]", "", utterance)
+    utterance = re.sub(r"‹", "", utterance)
+    utterance = re.sub(r"›", "", utterance)
+
+    utterance = re.sub(r"\[\*\] \[/", "[/", utterance)
+    utterance = re.sub(r"\] \[\*\]", "]", utterance)
 
     utterance = remove_extra_spaces(utterance)
     # print('step 1:', utterance)
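The `\x15`-delimited spans removed in step 1 are CHAT time alignment markers; the control bytes are invisible in most editors (hence the puzzled TODO in the pre-black code), so they are written above with explicit `\x15` escapes. A minimal sketch of the marker format that `TIMER_MARKER_REG` expects — the utterance and millisecond span below are made up:

```python
import re

# Same pattern as pylangacq.util.TIMER_MARKER_REG.
TIMER_MARKER_REG = re.compile(r"\x15-?(\d+)_(\d+)-?\x15")

# "\x154000_5500\x15" is a hypothetical start_stop span in milliseconds.
match = TIMER_MARKER_REG.search("more cookie . \x154000_5500\x15")
start, stop = (int(group) for group in match.groups())
assert (start, stop) == (4000, 5500)
```

`get_time_marker()` further down returns the same `(start, stop)` pair and raises `ValueError` when an utterance carries no marker.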
@@ -98,18 +94,18 @@ def clean_utterance(utterance, phon=False):
     #     (.) (short pause)
     # then pad them with extra spaces.
 
-    utterance = re.sub(r'<', ' <', utterance)
-    utterance = re.sub(r'\+ <', '+<', utterance)
-    utterance = re.sub(r'>', '> ', utterance)
-    utterance = re.sub(r'\[', ' [', utterance)
-    utterance = re.sub(r'\]', '] ', utterance)
-    utterance = re.sub(r'“', ' “ ', utterance)
-    utterance = re.sub(r'”', ' ” ', utterance)
-    utterance = re.sub(r',', ' , ', utterance)  # works together with next line
-    utterance = re.sub(r'\+ ,', '+,', utterance)
-    utterance = re.sub(r'[^\[\./!]\?', ' ? ', utterance)
+    utterance = re.sub(r"<", " <", utterance)
+    utterance = re.sub(r"\+ <", "+<", utterance)
+    utterance = re.sub(r">", "> ", utterance)
+    utterance = re.sub(r"\[", " [", utterance)
+    utterance = re.sub(r"\]", "] ", utterance)
+    utterance = re.sub(r"“", " “ ", utterance)
+    utterance = re.sub(r"”", " ” ", utterance)
+    utterance = re.sub(r",", " , ", utterance)  # works together with next line
+    utterance = re.sub(r"\+ ,", "+,", utterance)
+    utterance = re.sub(r"[^\[\./!]\?", " ? ", utterance)
     # utterance = re.sub('[^\(\[\.\+]\.', ' . ', utterance)
-    utterance = re.sub(r'\(\.\)', ' (.) ', utterance)
+    utterance = re.sub(r"\(\.\)", " (.) ", utterance)
 
     utterance = remove_extra_spaces(utterance)
     # print('step 2:', utterance)
@@ -127,51 +123,56 @@ def clean_utterance(utterance, phon=False):
     # 3. Delete the unwanted words on the left of the signaling annotations.
 
     angle_brackets_l2r_pairs = {}  # left-to-right
-    for index_ in find_indices(utterance, '<'):
+    for index_ in find_indices(utterance, "<"):
         counter = 1
         for i in range(index_ + 1, len(utterance)):
-            if utterance[i] == '<':
+            if utterance[i] == "<":
                 counter += 1
-            elif utterance[i] == '>':
+            elif utterance[i] == ">":
                 counter -= 1
             if counter == 0:
                 angle_brackets_l2r_pairs[index_] = i
                 break
 
-    angle_brackets_r2l_pairs = {v: k
-                                for k, v in angle_brackets_l2r_pairs.items()}
+    angle_brackets_r2l_pairs = {
+        v: k for k, v in angle_brackets_l2r_pairs.items()
+    }
 
     index_pairs = []  # characters bounded by index pairs to be removed
 
     # remove ' [///]'
-    triple_slash_right_indices = find_indices(utterance, r'> \[///\]')
-    index_pairs += [(begin + 1, begin + 6)
-                    for begin in triple_slash_right_indices]
+    triple_slash_right_indices = find_indices(utterance, r"> \[///\]")
+    index_pairs += [
+        (begin + 1, begin + 6) for begin in triple_slash_right_indices
+    ]
 
     # remove ' [//]'
-    double_overlap_right_indices = find_indices(utterance, r'> \[//\]')
-    index_pairs += [(begin + 1, begin + 5)
-                    for begin in double_overlap_right_indices]
+    double_overlap_right_indices = find_indices(utterance, r"> \[//\]")
+    index_pairs += [
+        (begin + 1, begin + 5) for begin in double_overlap_right_indices
+    ]
 
     # remove ' [/]'
-    single_overlap_right_indices = find_indices(utterance, r'> \[/\]')
-    index_pairs += [(begin + 1, begin + 4)
-                    for begin in single_overlap_right_indices]
+    single_overlap_right_indices = find_indices(utterance, r"> \[/\]")
+    index_pairs += [
+        (begin + 1, begin + 4) for begin in single_overlap_right_indices
+    ]
 
     # remove ' [/?]'
-    slash_question_indices = find_indices(utterance, r'> \[/\?\]')
-    index_pairs += [(begin + 1, begin + 4)
-                    for begin in slash_question_indices]
+    slash_question_indices = find_indices(utterance, r"> \[/\?\]")
+    index_pairs += [(begin + 1, begin + 4) for begin in slash_question_indices]
 
     # remove ' [::'
-    double_error_right_indices = find_indices(utterance, r'> \[::')
-    index_pairs += [(begin + 1, begin + 4)
-                    for begin in double_error_right_indices]
+    double_error_right_indices = find_indices(utterance, r"> \[::")
+    index_pairs += [
+        (begin + 1, begin + 4) for begin in double_error_right_indices
+    ]
 
     # remove ' [:'
-    single_error_right_indices = find_indices(utterance, r'> \[: ')
-    index_pairs += [(begin + 1, begin + 3)
-                    for begin in single_error_right_indices]
+    single_error_right_indices = find_indices(utterance, r"> \[: ")
+    index_pairs += [
+        (begin + 1, begin + 3) for begin in single_error_right_indices
+    ]
 
     right_indices = (
         double_overlap_right_indices
@@ -182,59 +183,69 @@ def clean_utterance(utterance, phon=False):
+        + slash_question_indices
     )
 
-    index_pairs = index_pairs + [(angle_brackets_r2l_pairs[right], right)
-                                 for right in sorted(right_indices)]
+    index_pairs = index_pairs + [
+        (angle_brackets_r2l_pairs[right], right)
+        for right in sorted(right_indices)
+    ]
 
     indices_to_ignore = set()
     for left, right in index_pairs:
         for i in range(left, right + 1):
             indices_to_ignore.add(i)
 
-    new_utterance = ''
+    new_utterance = ""
     for i in range(len(utterance)):
         if i not in indices_to_ignore:
             new_utterance += utterance[i]
     utterance = new_utterance
 
-    utterance = re.sub(r'\S+? \[/\]', '', utterance)
-    utterance = re.sub(r'\S+? \[//\]', '', utterance)
-    utterance = re.sub(r'\S+? \[///\]', '', utterance)
-    utterance = re.sub(r'\S+? \[/\?\]', '', utterance)
+    utterance = re.sub(r"\S+? \[/\]", "", utterance)
+    utterance = re.sub(r"\S+? \[//\]", "", utterance)
+    utterance = re.sub(r"\S+? \[///\]", "", utterance)
+    utterance = re.sub(r"\S+? \[/\?\]", "", utterance)
 
-    utterance = re.sub(r'\S+? \[::', '', utterance)
-    utterance = re.sub(r'\S+? \[:', '', utterance)
+    utterance = re.sub(r"\S+? \[::", "", utterance)
+    utterance = re.sub(r"\S+? \[:", "", utterance)
 
     utterance = remove_extra_spaces(utterance)
     # print('step 3:', utterance)
 
     # Step 4: Remove unwanted symbols
-    utterance = re.sub(r'“', '', utterance)
-    utterance = re.sub(r'”', '', utterance)
+    utterance = re.sub(r"“", "", utterance)
+    utterance = re.sub(r"”", "", utterance)
 
     utterance = remove_extra_spaces(utterance)
 
     # Step 5: Split utterance by spaces and determine whether to keep items.
-    escape_prefixes = {'[?', '[/', '[<', '[>', '[:', '[!', '[*',
-                       '+"', '+,', '<&'}
-    escape_words = {'0', '++', '+<', '+^',
-                    '(.)', '(..)', '(...)',
-                    ':', ';'}
-    keep_prefixes = {'+"/', '+,/', '+".'}
+    escape_prefixes = {
+        "[?",
+        "[/",
+        "[<",
+        "[>",
+        "[:",
+        "[!",
+        "[*",
+        '+"',
+        "+,",
+        "<&",
+    }
+    escape_words = {"0", "++", "+<", "+^", "(.)", "(..)", "(...)", ":", ";"}
+    keep_prefixes = {'+"/', "+,/", '+".'}
 
     if not phon:
-        escape_words.update({'xxx', 'yyy', 'www', 'xxx:', 'yyy:'})
-        escape_prefixes.update({'&'})
+        escape_words.update({"xxx", "yyy", "www", "xxx:", "yyy:"})
+        escape_prefixes.update({"&"})
     else:
-        escape_words.update({','})
-        escape_prefixes.update({'0'})
+        escape_words.update({","})
+        escape_prefixes.update({"0"})
 
     words = utterance.split()
     new_words = []
 
     for word in words:
-        word = re.sub(r'\A<', '', word)  # remove beginning <
-        word = re.sub(r'>\Z', '', word)  # remove final >
-        word = re.sub(r'\]\Z', '', word)  # remove final ]
+        word = re.sub(r"\A<", "", word)  # remove beginning <
+        word = re.sub(r">\Z", "", word)  # remove final >
+        word = re.sub(r"\]\Z", "", word)  # remove final ]
 
         not_an_escape_word = word not in escape_words
         no_escape_prefix = not any(word.startswith(e) for e in escape_prefixes)
@@ -245,7 +256,7 @@
 
     # print('step 5:', remove_extra_spaces(' '.join(new_words)))
 
-    return remove_extra_spaces(' '.join(new_words))
+    return remove_extra_spaces(" ".join(new_words))
 
 
 def get_time_marker(utterance):  # TODO: write tests for this function
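The net effect of the five cleaning steps is easiest to see through the parametrized cases in `pylangacq/tests/test_util.py` above; a few of them, restated as plain calls (inputs and expected outputs copied from `test_clean_utterance`):

```python
from pylangacq.util import clean_utterance

# Scope annotations, repetition markers, and pause timings all drop out.
assert clean_utterance("[= foo ] bar") == "bar"
assert clean_utterance("[x 2] bar") == "bar"
assert clean_utterance("(12.34) bar") == "bar"
```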
@@ -279,7 +290,7 @@ def get_time_marker(utterance):  # TODO: write tests for this function
         stop = int(time_marker[1])
         return start, stop
     else:
-        msg = 'Utterance does not have a standard time marker: %s' % utterance
+        msg = "Utterance does not have a standard time marker: %s" % utterance
         raise ValueError(msg)
 
 
@@ -298,7 +309,7 @@
     Return ``None`` if no participant code is found.
""" for tier_marker in tier_marker_seq: - if not tier_marker.startswith('%'): + if not tier_marker.startswith("%"): return tier_marker return None @@ -314,18 +325,18 @@ def clean_word(word): ------- str """ - new_word = (word - .replace('(', '') - .replace(')', '') - .replace(':', '') - .replace(';', '') - .replace('+', '') - ) - - if '@' in new_word: - new_word = new_word[: new_word.index('@')] - - if new_word.startswith('&'): + new_word = ( + word.replace("(", "") + .replace(")", "") + .replace(":", "") + .replace(";", "") + .replace("+", "") + ) + + if "@" in new_word: + new_word = new_word[: new_word.index("@")] + + if new_word.startswith("&"): new_word = new_word[1:] return new_word @@ -349,23 +360,23 @@ def convert_date_to_tuple(date_str): (2016, 2, 1) """ try: - day_str, month_str, year_str = date_str.split('-') + day_str, month_str, year_str = date_str.split("-") day = int(day_str) year = int(year_str) month_to_int = { - 'JAN': 1, - 'FEB': 2, - 'MAR': 3, - 'APR': 4, - 'MAY': 5, - 'JUN': 6, - 'JUL': 7, - 'AUG': 8, - 'SEP': 9, - 'OCT': 10, - 'NOV': 11, - 'DEC': 12, + "JAN": 1, + "FEB": 2, + "MAR": 3, + "APR": 4, + "MAY": 5, + "JUN": 6, + "JUL": 7, + "AUG": 8, + "SEP": 9, + "OCT": 10, + "NOV": 11, + "DEC": 12, } month = month_to_int[month_str] @@ -385,8 +396,8 @@ def get_lemma_from_mor(mor): ------- str """ - lemma, _, _ = mor.partition('-') - lemma, _, _ = lemma.partition('&') + lemma, _, _ = mor.partition("-") + lemma, _, _ = lemma.partition("&") return lemma @@ -401,8 +412,8 @@ def remove_extra_spaces(inputstr): ------- str """ - while ' ' in inputstr: - inputstr = inputstr.replace(' ', ' ') + while " " in inputstr: + inputstr = inputstr.replace(" ", " ") return inputstr.strip() diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..9e4fede --- /dev/null +++ b/requirements.txt @@ -0,0 +1,14 @@ +# These requirements are only for testing and local dev work. +# For tests +flake8==3.7.8 +pytest==5.1.2 +pytest-cov==2.7.1 +requests==2.22.0 +# +# For docs +alabaster==0.7.12 +Sphinx==2.2.0 +numpydoc==0.9.1 +# +# For PyPI releases +twine==1.13.0 diff --git a/setup.cfg b/setup.cfg index 884264f..716a043 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,10 +1,3 @@ -[metadata] -description-file = README.rst -license_file = LICENSE.txt - -[bdist_wheel] -universal=1 - [flake8] ignore = # E731: no lambda expressions diff --git a/setup.py b/setup.py index e95aca8..d85d72f 100644 --- a/setup.py +++ b/setup.py @@ -1,79 +1,82 @@ import os -import sys -import warnings +import setuptools -from setuptools import setup, find_packages + +if getattr(setuptools, "__version__", "0") < "39": + # v36.4.0+ needed to automatically include README.md in packaging + # v38.6.0+ needed for long_description_content_type in setup() + raise EnvironmentError( + "Your setuptools is too old. " + "Please run 'pip install --upgrade pip setuptools'." + ) -_PACKAGE_NAME = 'pylangacq' -_PYTHON_VERSION = sys.version_info[:3] _THIS_DIR = os.path.dirname(os.path.realpath(__file__)) -with open(os.path.join(_THIS_DIR, _PACKAGE_NAME, '_version.py')) as f: +with open(os.path.join(_THIS_DIR, "pylangacq", "_version.py")) as f: # get __version__ exec(f.read()) -with open(os.path.join(_THIS_DIR, 'README.rst')) as f: +with open(os.path.join(_THIS_DIR, "README.md")) as f: _LONG_DESCRIPTION = f.read().strip() def main(): - if _PYTHON_VERSION < (3, 5): - warnings.warn( - 'You are currently on Python {py_version}. ' - 'Python < 3.5 is deprecated and not supported ' - 'since pylangacq v0.11.0. 
-            'since pylangacq v0.11.0. '.format(
-                py_version='.'.join(_PYTHON_VERSION)
-            ),
-            DeprecationWarning
-        )
-
-    setup(
-        name=_PACKAGE_NAME,
+    setuptools.setup(
+        name="pylangacq",
         version=__version__,  # noqa: F821
-        description='PyLangAcq: Language Acquisition Research in Python',
+        description="PyLangAcq: Language Acquisition Research in Python",
         long_description=_LONG_DESCRIPTION,
-        url='http://pylangacq.org/',
-        author='Jackson Lee',
-        author_email='jacksonlunlee@gmail.com',
-        license='MIT License',
-        packages=find_packages(),
-        keywords=['computational linguistics', 'natural language processing',
-                  'NLP', 'linguistics', 'corpora', 'speech',
-                  'language', 'CHILDES', 'CHAT', 'transcription',
-                  'acquisition', 'development', 'learning'],
-
-        package_data={'pylangacq': ['tests/test_data/*']},
-
+        long_description_content_type="text/markdown",
+        url="http://pylangacq.org/",
+        author="Jackson L. Lee",
+        author_email="jacksonlunlee@gmail.com",
+        license="MIT License",
+        packages=setuptools.find_packages(),
+        python_requires=">=3.6",
+        keywords=[
+            "computational linguistics",
+            "natural language processing",
+            "NLP",
+            "linguistics",
+            "corpora",
+            "speech",
+            "language",
+            "CHILDES",
+            "CHAT",
+            "transcription",
+            "acquisition",
+            "development",
+            "learning",
+        ],
+        package_data={"pylangacq": ["tests/test_data/*"]},
         zip_safe=False,
         classifiers=[
-            'Development Status :: 4 - Beta',
-            'Intended Audience :: Developers',
-            'Intended Audience :: Education',
-            'Intended Audience :: Information Technology',
-            'Intended Audience :: Science/Research',
-            'License :: OSI Approved :: MIT License',
-            'Operating System :: OS Independent',
-            'Programming Language :: Python :: 2',
-            'Programming Language :: Python :: 2.7',
-            'Programming Language :: Python :: 3',
-            'Programming Language :: Python :: 3.4',
-            'Programming Language :: Python :: 3.5',
-            'Programming Language :: Python :: 3.6',
-            'Programming Language :: Python :: 3.7',
-            'Topic :: Scientific/Engineering',
-            'Topic :: Scientific/Engineering :: Artificial Intelligence',
-            'Topic :: Scientific/Engineering :: Human Machine Interfaces',
-            'Topic :: Scientific/Engineering :: Information Analysis',
-            'Topic :: Text Processing',
-            'Topic :: Text Processing :: Filters',
-            'Topic :: Text Processing :: General',
-            'Topic :: Text Processing :: Indexing',
-            'Topic :: Text Processing :: Linguistic'
+            "Development Status :: 4 - Beta",
+            "Intended Audience :: Developers",
+            "Intended Audience :: Education",
+            "Intended Audience :: Information Technology",
+            "Intended Audience :: Science/Research",
+            "License :: OSI Approved :: MIT License",
+            "Operating System :: OS Independent",
+            "Programming Language :: Python :: 3",
+            "Programming Language :: Python :: 3 :: Only",
+            "Programming Language :: Python :: 3.6",
+            "Programming Language :: Python :: 3.7",
+            "Programming Language :: Python :: 3.8",
+            "Topic :: Scientific/Engineering",
+            "Topic :: Scientific/Engineering :: Artificial Intelligence",
+            "Topic :: Scientific/Engineering :: Human Machine Interfaces",
+            "Topic :: Scientific/Engineering :: Information Analysis",
+            "Topic :: Text Processing",
+            "Topic :: Text Processing :: Filters",
+            "Topic :: Text Processing :: General",
+            "Topic :: Text Processing :: Indexing",
+            "Topic :: Text Processing :: Linguistic",
        ],
+        data_files=[(".", ["LICENSE.txt", "CHANGELOG.md"])],
     )
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     main()
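With the packaging modernized, a quick smoke test of an installed copy of the package ties the pieces together (a sketch: the CHAT file path is hypothetical, and it assumes the top-level `read_chat` helper that the tests above rely on):

```python
import pylangacq

print(pylangacq.__version__)

# "010600a.cha" is a hypothetical local CHAT transcript (e.g., from Brown/Eve).
eve = pylangacq.read_chat("010600a.cha", encoding="utf-8")
print(eve.number_of_utterances())
print(eve.words()[:5])
```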