Skip to content

Commit

Permalink
Merge branch 'master' into typed
Browse files Browse the repository at this point in the history
  • Loading branch information
trim21 authored Oct 28, 2024
2 parents 3190837 + 57cfbda commit 4eaa725
Show file tree
Hide file tree
Showing 18 changed files with 354 additions and 144 deletions.
43 changes: 34 additions & 9 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ on:
tags: ["*"]
pull_request:
branches: [master]
schedule:
- cron: "30 16 1 * *"
workflow_dispatch:

env:
Expand Down Expand Up @@ -33,6 +35,7 @@ jobs:
- "3.10"
- "3.11"
- "3.12"
- "3.13"
- "pypy-2.7"
- "pypy-3.7"
- "pypy-3.8"
Expand All @@ -48,13 +51,26 @@ jobs:
- os: ubuntu-20.04
container: python:2.7-buster
python-version: "2.7"
exclude:
- os: macos-latest
python-version: "3.7"
- os: macos-latest
python-version: "pypy-3.7"

runs-on: ${{ matrix.os }}
container: ${{ matrix.container }}

steps:
- uses: actions/checkout@v3
- uses: actions/setup-python@v4
- uses: actions/checkout@v4

- name: Ignore certificate verification on python 3.5
shell: bash
run: |
# INSECURE!! But it should be OK for CI tests.
echo 'PIP_TRUSTED_HOST=pypi.python.org pypi.org files.pythonhosted.org' >>$GITHUB_ENV
if: 'matrix.python-version == 3.5'

- uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
allow-prereleases: true
Expand Down Expand Up @@ -117,10 +133,11 @@ jobs:
fi
- name: Upload coverage data
uses: actions/upload-artifact@v3
uses: actions/upload-artifact@v4
with:
name: coverage-data
name: coverage-data.${{ matrix.os }}-${{ matrix.python-version }}
path: .coverage.*
include-hidden-files: true
if-no-files-found: ignore

coverage:
Expand All @@ -129,14 +146,22 @@ jobs:
needs: tests

steps:
- uses: actions/checkout@v3
- uses: actions/setup-python@v4
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version-file: .python-version-default
cache: pip

- name: Merge coverage data artifacts
uses: actions/upload-artifact/merge@v4
with:
name: coverage-data
pattern: coverage-data.*
include-hidden-files: true
delete-merged: true

- name: Download coverage data
uses: actions/download-artifact@v3
uses: actions/download-artifact@v4
with:
name: coverage-data

Expand All @@ -152,7 +177,7 @@ jobs:
python -Im coverage report --format=markdown >> $GITHUB_STEP_SUMMARY
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v3
uses: codecov/codecov-action@v4
env:
CODECOV_TOKEN: ${{secrets.CODECOV_TOKEN}}

Expand All @@ -162,7 +187,7 @@ jobs:
python -Im coverage report --fail-under=100
- name: Upload HTML report if check failed.
uses: actions/upload-artifact@v3
uses: actions/upload-artifact@v4
with:
name: html-report
path: htmlcov
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/codeql.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ on:
pull_request:
branches: [ "master" ]
schedule:
- cron: "36 4 * * 2"
- cron: "36 4 1 * *"

jobs:
analyze:
Expand Down
2 changes: 1 addition & 1 deletion .python-version-default
Original file line number Diff line number Diff line change
@@ -1 +1 @@
3.11
3.12
57 changes: 36 additions & 21 deletions bin/update-tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
from pathlib import Path
from dataclasses import field, fields, dataclass

from typing import Any, Mapping, Iterable, Iterator, Sequence, Container, Collection
from typing import Any, Mapping, Iterable, Iterator, Sequence, Collection

try:
from typing import Self
Expand Down Expand Up @@ -54,6 +54,19 @@
MAX_RETRIES = int(os.environ.get('MAX_RETRIES', '6'))
BACKOFF_FACTOR = float(os.environ.get('BACKOFF_FACTOR', '0.1'))

# Hangul Jamo is a decomposed form of Hangul Syllables, see
# https://www.unicode.org/faq/korean.html#3
# https://github.com/ridiculousfish/widecharwidth/pull/17
# https://github.com/jquast/ucs-detect/issues/9
# https://devblogs.microsoft.com/oldnewthing/20201009-00/?p=104351
# "Conjoining Jamo are divided into three classes: L, V, T (Leading
# consonant, Vowel, Trailing consonant). A Hangul Syllable consists of
# <LV> or <LVT> sequences."
HANGUL_JAMO_ZEROWIDTH = (
    # Hangul Jungseong Filler .. Hangul Jongseong Ssangnieun
    tuple(range(0x1160, 0x1200))
    # Hangul Jungseong O-Yeo .. Undefined Character of Hangul Jamo Extended-B
    + tuple(range(0xD7B0, 0xD800))
)


def _bisearch(ucs, table):
"""A copy of wcwidth._bisearch, to prevent having issues when depending on code that imports
Expand All @@ -77,7 +90,7 @@ def _bisearch(ucs, table):

@dataclass(order=True, frozen=True)
class UnicodeVersion:
"""A class for camparable unicode version."""
"""A class for comparable unicode version."""
major: int
minor: int
micro: int | None
Expand Down Expand Up @@ -112,11 +125,11 @@ class TableEntry:
properties: tuple[str, ...]
comment: str

def filter_by_category(self, category_codes: str, wide: int) -> bool:
def filter_by_category_width(self, wide: int) -> bool:
"""
Return whether entry matches given category code and displayed width.
Return whether entry matches displayed width.
Categories are described here, https://www.unicode.org/reports/tr44/#GC_Values_Table
Parses both DerivedGeneralCategory.txt and EastAsianWidth.txt
"""
if self.code_range is None:
return False
Expand Down Expand Up @@ -146,13 +159,12 @@ def filter_by_category(self, category_codes: str, wide: int) -> bool:
return wide == 1

@staticmethod
def parse_category_values(category_codes: str,
table_iter: Iterator[TableEntry],
wide: int) -> set[tuple[int, int]]:
def parse_width_category_values(table_iter: Iterator[TableEntry],
wide: int) -> set[tuple[int, int]]:
"""Parse value ranges of unicode data files, by given category and width."""
return {n
for entry in table_iter
if entry.filter_by_category(category_codes, wide)
if entry.filter_by_category_width(wide)
for n in list(range(entry.code_range[0], entry.code_range[1]))}


Expand Down Expand Up @@ -326,18 +338,19 @@ def fetch_table_wide_data() -> UnicodeTableRenderCtx:
for version in fetch_unicode_versions():
# parse typical 'wide' characters by categories 'W' and 'F',
table[version] = parse_category(fname=UnicodeDataFile.EastAsianWidth(version),
category_codes=('W', 'F'),
wide=2)

# subtract(!) wide characters that were defined above as 'W' category in EastAsianWidth,
# but also zero-width category 'Mn' or 'Mc' in DerivedGeneralCategory!
table[version].values.discard(parse_category(fname=UnicodeDataFile.DerivedGeneralCategory(version),
category_codes=('Mn', 'Mc'),
wide=0).values)
table[version].values = table[version].values.difference(parse_category(
fname=UnicodeDataFile.DerivedGeneralCategory(version),
wide=0).values)

# Also subtract Hangul Jamo Vowels and Hangul Trailing Consonants
table[version].values = table[version].values.difference(HANGUL_JAMO_ZEROWIDTH)

# finally, join with atypical 'wide' characters defined by category 'Sk',
table[version].values.update(parse_category(fname=UnicodeDataFile.DerivedGeneralCategory(version),
category_codes=('Sk',),
wide=2).values)
return UnicodeTableRenderCtx('WIDE_EASTASIAN', table)

Expand All @@ -352,11 +365,13 @@ def fetch_table_zero_data() -> UnicodeTableRenderCtx:
for version in fetch_unicode_versions():
# Determine values of zero-width character lookup table by the following category codes
table[version] = parse_category(fname=UnicodeDataFile.DerivedGeneralCategory(version),
category_codes=('Me', 'Mn', 'Mc', 'Cf', 'Zl', 'Zp', 'Sk'),
wide=0)

# And, include NULL
# Include NULL
table[version].values.add(0)

# Add Hangul Jamo Vowels and Hangul Trailing Consonants
table[version].values.update(HANGUL_JAMO_ZEROWIDTH)
return UnicodeTableRenderCtx('ZERO_WIDTH', table)


Expand Down Expand Up @@ -482,7 +497,7 @@ def parse_unicode_table(file: Iterable[str]) -> Iterator[TableEntry]:


def parse_vs16_table(fp: Iterable[str]) -> Iterator[TableEntry]:
"""Parse emoji-variation-sequences.txt for codepoints that preceed 0xFE0F."""
"""Parse emoji-variation-sequences.txt for codepoints that precede 0xFE0F."""
hex_str_vs16 = 'FE0F'
for line in fp:
data, _, comment = line.partition('#')
Expand All @@ -496,14 +511,14 @@ def parse_vs16_table(fp: Iterable[str]) -> Iterator[TableEntry]:
continue
code_points = code_points_str.split()
if len(code_points) == 2 and code_points[1] == hex_str_vs16:
# yeild a single "code range" entry for a single value that preceeds FE0F
# yield a single "code range" entry for a single value that precedes FE0F
yield TableEntry((int(code_points[0], 16), int(code_points[0], 16)), tuple(properties), comment)


@functools.cache
def parse_category(fname: str, category_codes: Container[str], wide: int) -> TableDef:
def parse_category(fname: str, wide: int) -> TableDef:
"""Parse value ranges of unicode data files, by given categories into string tables."""
print(f'parsing {fname} category_codes={",".join(category_codes)}: ', end='', flush=True)
print(f'parsing {fname}, wide={wide}: ', end='', flush=True)

with open(fname, encoding='utf-8') as f:
table_iter = parse_unicode_table(f)
Expand All @@ -512,7 +527,7 @@ def parse_category(fname: str, category_codes: Container[str], wide: int) -> Tab
version = next(table_iter).comment.strip()
# and "date string" from second line
date = next(table_iter).comment.split(':', 1)[1].strip()
values = TableEntry.parse_category_values(category_codes, table_iter, wide)
values = TableEntry.parse_width_category_values(table_iter, wide)
print('ok')
return TableDef(version, date, values)

Expand Down
48 changes: 38 additions & 10 deletions bin/verify-table-integrity.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,9 +63,30 @@
import logging


def bisearch_pair(ucs, table):
    """
    A copy of wcwidth._bisearch() but also returns the range of matched values.

    Binary-search ``table`` for the range that contains ``ucs``.

    :param ucs: integer value to look up.
    :param table: sequence of ``(first, last)`` pairs, each pair an inclusive
        range.  The binary search requires the pairs to be sorted in
        ascending order and non-overlapping.
    :returns: ``(1, first, last)`` for the pair containing ``ucs``, otherwise
        ``(0, None, None)``.  An empty ``table`` never matches.
    """
    not_found = (0, None, None)

    # An empty table has no first/last pair to compare against; report
    # "not found" instead of raising IndexError on table[0].
    if not table:
        return not_found

    lbound = 0
    ubound = len(table) - 1

    # Fast rejection of values outside the span covered by the whole table.
    if ucs < table[0][0] or ucs > table[ubound][1]:
        return not_found

    while ubound >= lbound:
        mid = (lbound + ubound) // 2
        first, last = table[mid][0], table[mid][1]
        if ucs > last:
            lbound = mid + 1
        elif ucs < first:
            ubound = mid - 1
        else:
            return (1, first, last)

    # Value falls in a gap between two adjacent ranges.
    return not_found


def main(log: logging.Logger):
# local
from wcwidth import ZERO_WIDTH, WIDE_EASTASIAN, _bisearch, list_versions
from wcwidth import ZERO_WIDTH, WIDE_EASTASIAN, list_versions

reversed_uni_versions = list(reversed(list_versions()))
tables = {'ZERO_WIDTH': ZERO_WIDTH,
'WIDE_EASTASIAN': WIDE_EASTASIAN}
Expand All @@ -81,14 +102,21 @@ def main(log: logging.Logger):
other_table = tables[other_table_name][version]
for start_range, stop_range in curr_table:
for unichar_n in range(start_range, stop_range):
if not _bisearch(unichar_n, next_table):
log.info(f'value {hex(unichar_n)} in table_name={table_name}'
f' version={version} is not defined in next_version={next_version}'
f' from inclusive range {hex(start_range)}-{hex(stop_range)}')
if _bisearch(unichar_n, other_table):
log.error(f'value {hex(unichar_n)} in table_name={table_name}'
f' version={version} is duplicated in other_table_name={other_table_name}'
f' from inclusive range {hex(start_range)}-{hex(stop_range)}')
result, _, _ = bisearch_pair(unichar_n, next_table)
if not result:
log.info(
f'value 0x{unichar_n:05x} in table_name={table_name}'
f' version={version} is not defined in next_version={next_version}'
f' from inclusive range {hex(start_range)}-{hex(stop_range)}'
)
result, lbound, ubound = bisearch_pair(unichar_n, other_table)
if result:
log.error(
f'value 0x{unichar_n:05x} in table_name={table_name}'
f' version={version} is duplicated in other_table_name={other_table_name}'
f' from inclusive range 0x{start_range:05x}-0x{stop_range:05x} of'
f' {table_name} against 0x{lbound:05x}-0x{ubound:05x} in {other_table_name}'
)
errors += 1
if errors:
log.error(f'{errors} errors, exit 1')
Expand Down
2 changes: 1 addition & 1 deletion bin/wcwidth-browser.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ def __init__(self, width, unicode_version):
"""
self.characters = []
letters_o = ('o' * width)
for (begin, end) in ZERO_WIDTH[unicode_version]:
for (begin, end) in ZERO_WIDTH[_wcmatch_version(unicode_version)]:
for val in [_val for _val in
range(begin, end + 1)
if _val <= LIMIT_UCS]:
Expand Down
Empty file.
14 changes: 11 additions & 3 deletions docs/intro.rst
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ Example
>>> text = u'コンニチハ'

Python **incorrectly** uses the *string length* of 5 codepoints rather than the
*printible length* of 10 cells, so that when using the `rjust` function, the
*printable length* of 10 cells, so that when using the `rjust` function, the
output length is wrong::

>>> print(len('コンニチハ'))
Expand Down Expand Up @@ -216,8 +216,15 @@ Other Languages
=======
History
=======

0.2.13 *2024-01-06*
* **Bugfix** zero-width support for Hangul Jamo (Korean)

0.2.12 *2023-11-21*
* re-release to remove .pyi file misplaced in wheel files `Issue #101`_.

0.2.11 *2023-11-20*
* Include tests files in the source distibution (`PR #98`_, `PR #100`_).
* Include tests files in the source distribution (`PR #98`_, `PR #100`_).

0.2.10 *2023-11-13*
* **Bugfix** accounting of some kinds of emoji sequences using U+FE0F
Expand All @@ -231,7 +238,7 @@ History
character measurements.

0.2.8 *2023-09-30*
* Include requirements files in the source distibution (`PR #82`_).
* Include requirements files in the source distribution (`PR #82`_).

0.2.7 *2023-09-28*
* **Updated** tables to include Unicode Specification 15.1.0.
Expand Down Expand Up @@ -330,6 +337,7 @@ https://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c::
.. _`PR #97`: https://github.com/jquast/wcwidth/pull/97
.. _`PR #98`: https://github.com/jquast/wcwidth/pull/98
.. _`PR #100`: https://github.com/jquast/wcwidth/pull/100
.. _`Issue #101`: https://github.com/jquast/wcwidth/issues/101
.. _`jquast/blessed`: https://github.com/jquast/blessed
.. _`selectel/pyte`: https://github.com/selectel/pyte
.. _`thomasballinger/curtsies`: https://github.com/thomasballinger/curtsies
Expand Down
Loading

0 comments on commit 4eaa725

Please sign in to comment.