Skip to content

Commit

Permalink
The escape function no longer escapes \x00. It's not necessary.
Browse files Browse the repository at this point in the history
Inline flags can now be turned off and apply to what follows.

Added \R to match line endings.
  • Loading branch information
Matthew Barnett committed Dec 24, 2023
1 parent 34333d5 commit cdcbf36
Show file tree
Hide file tree
Showing 7 changed files with 54 additions and 58 deletions.
10 changes: 9 additions & 1 deletion .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ jobs:

env:
# macOS archs
CIBW_ARCHS_MACOS: "x86_64 arm64"
CIBW_ARCHS_MACOS: "x86_64 arm64 universal2"

steps:
- uses: actions/checkout@v3
Expand All @@ -64,6 +64,14 @@ jobs:
name: regex-files
path: wheelhouse/*.whl

- name: Create GitHub release
uses: actions/create-release@v1
env:
GITHUB_TOKEN: ${{ secrets.RELEASE_TOKEN }}
with:
tag_name: ${{ github.ref }}
release_name: Release ${{ github.ref }}

# Build source distribution & manylinux1_x86_64 wheels
# These two jobs build:
# 1, build_wheels (above): manylinux1_i686 / manylinux2014_x86_64
Expand Down
8 changes: 8 additions & 0 deletions changelog.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,11 @@
Version: 2023.12.23

The escape function no longer escapes \x00. It's not necessary.

Inline flags can now be turned off and apply to what follows.

Added \R to match line endings.

Version: 2023.10.3

Updated to Unicode 15.1.0.
Expand Down
25 changes: 9 additions & 16 deletions regex_3/_regex_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -1150,22 +1150,7 @@ def parse_flags_subpattern(source, info):

def parse_positional_flags(source, info, flags_on, flags_off):
"Parses positional flags."
version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION
if version == VERSION0:
# Positional flags are global and can only be turned on.
if flags_off:
raise error("bad inline flags: cannot turn flags off",
source.string, source.pos)

new_global_flags = flags_on & ~info.global_flags
if new_global_flags:
info.global_flags |= new_global_flags

# A global has been turned on, so reparse the pattern.
raise _UnscopedFlagSet(info.global_flags)
else:
info.flags = (info.flags | flags_on) & ~flags_off

info.flags = (info.flags | flags_on) & ~flags_off
source.ignore_space = bool(info.flags & VERBOSE)

def parse_name(source, allow_numeric=False, allow_group_0=False):
Expand Down Expand Up @@ -1233,6 +1218,14 @@ def parse_escape(source, info, in_set):
elif ch in "pP":
# A Unicode property, positive or negative.
return parse_property(source, info, ch == "p", in_set)
elif ch == "R" and not in_set:
# A line ending.
charset = [0x0A, 0x0B, 0x0C, 0x0D]
if info.guess_encoding == UNICODE:
charset.extend([0x85, 0x2028, 0x2029])

return Atomic(Branch([String([0x0D, 0x0A]), SetUnion(info, [Character(c)
for c in charset])]))
elif ch == "X" and not in_set:
# A grapheme cluster.
return Grapheme()
Expand Down
6 changes: 1 addition & 5 deletions regex_3/regex.py
Original file line number Diff line number Diff line change
Expand Up @@ -241,7 +241,7 @@
"VERSION1", "X", "VERBOSE", "W", "WORD", "error", "Regex", "__version__",
"__doc__", "RegexFlag"]

__version__ = "2.5.135"
__version__ = "2.5.136"

# --------------------------------------------------------------------
# Public interface.
Expand Down Expand Up @@ -392,8 +392,6 @@ def escape(pattern, special_only=True, literal_spaces=False):
elif c in _METACHARS or c.isspace():
s.append("\\")
s.append(c)
elif c == "\x00":
s.append("\\000")
else:
s.append(c)
else:
Expand All @@ -402,8 +400,6 @@ def escape(pattern, special_only=True, literal_spaces=False):
s.append(c)
elif c in _ALNUM:
s.append(c)
elif c == "\x00":
s.append("\\000")
else:
s.append("\\")
s.append(c)
Expand Down
59 changes: 25 additions & 34 deletions regex_3/test_regex.py
Original file line number Diff line number Diff line change
Expand Up @@ -911,10 +911,9 @@ def test_inline_flags(self):
p = regex.compile('(?iu)' + lower_char)
self.assertEqual(bool(p.match(upper_char)), True)

# Changed to positional flags in regex 2023.12.23.
self.assertEqual(bool(regex.match(r"(?i)a", "A")), True)
self.assertEqual(bool(regex.match(r"a(?i)", "A")), True)
self.assertEqual(bool(regex.match(r"(?iV1)a", "A")), True)
self.assertEqual(regex.match(r"a(?iV1)", "A"), None)
self.assertEqual(regex.match(r"a(?i)", "A"), None)

def test_dollar_matches_twice(self):
# $ matches the end of string, and just before the terminating \n.
Expand Down Expand Up @@ -1396,18 +1395,15 @@ def test_scoped_and_inline_flags(self):
# Issues 433028, 433024, 433027.
self.assertEqual(regex.search(r"(?i)Ab", "ab").span(), (0, 2))
self.assertEqual(regex.search(r"(?i:A)b", "ab").span(), (0, 2))
self.assertEqual(regex.search(r"A(?i)b", "ab").span(), (0, 2))
self.assertEqual(regex.search(r"A(?iV1)b", "ab"), None)

self.assertRaisesRegex(regex.error, self.CANT_TURN_OFF, lambda:
regex.search(r"(?V0-i)Ab", "ab", flags=regex.I))
# Changed to positional flags in regex 2023.12.23.
self.assertEqual(regex.search(r"A(?i)b", "ab"), None)

self.assertEqual(regex.search(r"(?V0)Ab", "ab"), None)
self.assertEqual(regex.search(r"(?V1)Ab", "ab"), None)
self.assertEqual(regex.search(r"(?V1-i)Ab", "ab", flags=regex.I), None)
self.assertEqual(regex.search(r"(?-i)Ab", "ab", flags=regex.I), None)
self.assertEqual(regex.search(r"(?-i:A)b", "ab", flags=regex.I), None)
self.assertEqual(regex.search(r"A(?V1-i)b", "ab",
flags=regex.I).span(), (0, 2))
self.assertEqual(regex.search(r"A(?-i)b", "ab", flags=regex.I).span(),
(0, 2))

def test_repeated_repeats(self):
# Issue 2537.
Expand Down Expand Up @@ -1820,12 +1816,10 @@ def test_various(self):
('a.*b', 'acc\nccb', '', ascii(None)),
('a.{4,5}b', 'acc\nccb', '', ascii(None)),
('a.b', 'a\rb', '0', ascii('a\rb')),
# The new behaviour is that the inline flag affects only what follows.
('a.b(?s)', 'a\nb', '0', ascii('a\nb')),
('a.b(?sV1)', 'a\nb', '', ascii(None)),
# Changed to positional flags in regex 2023.12.23.
('a.b(?s)', 'a\nb', '', ascii(None)),
('(?s)a.b', 'a\nb', '0', ascii('a\nb')),
('a.*(?s)b', 'acc\nccb', '0', ascii('acc\nccb')),
('a.*(?sV1)b', 'acc\nccb', '', ascii(None)),
('a.*(?s)b', 'acc\nccb', '', ascii(None)),
('(?s)a.*b', 'acc\nccb', '0', ascii('acc\nccb')),
('(?s)a.{4,5}b', 'acc\nccb', '0', ascii('acc\nccb')),

Expand Down Expand Up @@ -2345,12 +2339,9 @@ def test_various(self):
# Not an error under PCRE/PRE:
# Positional inline flags now always affect only what follows (changed in
# regex 2023.12.23), so a trailing flag has no effect on the preceding text.
('w(?i)', 'W', '0', ascii('W')),
('w(?iV1)', 'W', '0', ascii(None)),
('w(?i)', 'W', '0', ascii(None)),
('w(?i)', 'w', '0', ascii('w')),
('w(?iV1)', 'w', '0', ascii('w')),
('(?i)w', 'W', '0', ascii('W')),
('(?iV1)w', 'W', '0', ascii('W')),

# Comments using the x embedded pattern modifier.
("""(?x)w# comment 1
Expand Down Expand Up @@ -2403,14 +2394,10 @@ def test_various(self):
# Bug 114033: nothing to repeat.
(r'(x?)?', 'x', '0', ascii('x')),
# Bug 115040: rescan if flags are modified inside pattern.
# If the new behaviour is turned on then positional inline flags
# affect only what follows.
(r' (?x)foo ', 'foo', '0', ascii('foo')),
(r' (?V1x)foo ', 'foo', '0', ascii(None)),
# Changed to positional flags in regex 2023.12.23.
(r' (?x)foo ', 'foo', '0', ascii(None)),
(r'(?x) foo ', 'foo', '0', ascii('foo')),
(r'(?V1x) foo ', 'foo', '0', ascii('foo')),
(r'(?x)foo ', 'foo', '0', ascii('foo')),
(r'(?V1x)foo ', 'foo', '0', ascii('foo')),
# Bug 115618: negative lookahead.
(r'(?<!abc)(d.f)', 'abcdefdof', '0', ascii('dof')),
# Bug 116251: character class bug.
Expand Down Expand Up @@ -3154,10 +3141,8 @@ def test_hg_bugs(self):

# Hg issue 39: regex.search("((?i)blah)\\s+\\1", "blah BLAH") doesn't
# return None
self.assertEqual(regex.search(r"(?V0)((?i)blah)\s+\1",
"blah BLAH").group(0, 1), ("blah BLAH", "blah"))
self.assertEqual(regex.search(r"(?V1)((?i)blah)\s+\1", "blah BLAH"),
None)
# Changed to positional flags in regex 2023.12.23.
self.assertEqual(regex.search(r"((?i)blah)\s+\1", "blah BLAH"), None)

# Hg issue 40: regex.search("(\()?[^()]+(?(1)\)|)", "(abcd").group(0)
# returns "bcd" instead of "abcd"
Expand Down Expand Up @@ -4336,10 +4321,10 @@ def test_hg_bugs(self):
self.assertEqual(regex.search(r"^a?(a?)b?c\1$", "abca").span(), (0, 4))

# Git issue 498: Conditional negative lookahead inside positive lookahead fails to match
self.assertEqual(regex.match(r"(?(?=a).|..)", "ab").span(), (0, 1))
self.assertEqual(regex.match(r"(?(?=b).|..)", "ab").span(), (0, 2))
self.assertEqual(regex.match(r"(?(?!a).|..)", "ab").span(), (0, 2))
self.assertEqual(regex.match(r"(?(?!b).|..)", "ab").span(), (0, 1))
self.assertEqual(regex.match(r'(?(?=a).|..)', 'ab').span(), (0, 1))
self.assertEqual(regex.match(r'(?(?=b).|..)', 'ab').span(), (0, 2))
self.assertEqual(regex.match(r'(?(?!a).|..)', 'ab').span(), (0, 2))
self.assertEqual(regex.match(r'(?(?!b).|..)', 'ab').span(), (0, 1))

def test_fuzzy_ext(self):
self.assertEqual(bool(regex.fullmatch(r'(?r)(?:a){e<=1:[a-z]}', 'e')),
Expand Down Expand Up @@ -4460,6 +4445,12 @@ def test_more_zerowidth(self):
self.assertEqual([m.span() for m in regex.finditer(r'(?m)^\s*?$',
'foo\n\n\nbar')], [(4, 4), (4, 5), (5, 5)])

def test_line_ending(self):
self.assertEqual(regex.findall(r'\R', '\r\n\n\x0B\f\r\x85\u2028\u2029'),
['\r\n', '\n', '\x0B', '\f', '\r', '\x85', '\u2028', '\u2029'])
self.assertEqual(regex.findall(br'\R', b'\r\n\n\x0B\f\r\x85'), [b'\r\n',
b'\n', b'\x0B', b'\f', b'\r'])

def test_main():
unittest.main(verbosity=2)

Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

setup(
name='regex',
version='2023.10.3',
version='2023.12.23',
description='Alternative regular expression module, to replace re.',
long_description=long_description,
long_description_content_type='text/x-rst',
Expand Down
2 changes: 1 addition & 1 deletion tools/build_regex_unicode.py
Original file line number Diff line number Diff line change
Expand Up @@ -1781,4 +1781,4 @@ def make_key(names):

generate_code(unicode_data, UNICODE_VERSION, this_folder)

print('\nSuccessfully generated _reges_unicode.h and _reges_unicode.c in %s' % tools_folder)
print('\nSuccessfully generated _regex_unicode.h and _regex_unicode.c in %s' % tools_folder)

0 comments on commit cdcbf36

Please sign in to comment.