Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

gh-91524: Speed up the regular expression substitution #91525

Merged
11 commits merged on Oct 23, 2022
5 changes: 5 additions & 0 deletions Doc/whatsnew/3.12.rst
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,11 @@ Optimizations
process, which improves performance by 1-5%.
(Contributed by Kevin Modzelewski in :gh:`90536`.)

* Speed up the regular expression substitution (functions :func:`re.sub` and
:func:`re.subn` and corresponding :class:`re.Pattern` methods) for
replacement strings containing group references by 2--3 times.
(Contributed by Serhiy Storchaka in :gh:`91524`.)


CPython bytecode changes
========================
Expand Down
22 changes: 4 additions & 18 deletions Lib/re/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,7 @@
import enum
from . import _compiler, _parser
import functools
import _sre


# public symbols
Expand Down Expand Up @@ -230,7 +231,7 @@ def purge():
"Clear the regular expression caches"
_cache.clear()
_cache2.clear()
_compile_repl.cache_clear()
_compile_template.cache_clear()

def template(pattern, flags=0):
"Compile a template pattern, returning a Pattern object, deprecated"
Expand Down Expand Up @@ -328,24 +329,9 @@ def _compile(pattern, flags):
return p

@functools.lru_cache(_MAXCACHE)
def _compile_repl(repl, pattern):
def _compile_template(pattern, repl):
gpshead marked this conversation as resolved.
Show resolved Hide resolved
# internal: compile replacement pattern
return _parser.parse_template(repl, pattern)

def _expand(pattern, match, template):
# internal: Match.expand implementation hook
template = _parser.parse_template(template, pattern)
return _parser.expand_template(template, match)

def _subx(pattern, template):
# internal: Pattern.sub/subn implementation helper
template = _compile_repl(template, pattern)
if not template[0] and len(template[1]) == 1:
# literal replacement
return template[1][0]
def filter(match, template=template):
return _parser.expand_template(template, match)
return filter
return _sre.template(pattern, _parser.parse_template(repl, pattern))

# register myself for pickling

Expand Down
2 changes: 1 addition & 1 deletion Lib/re/_constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

# update when constants are added or removed

MAGIC = 20220615
MAGIC = 20221023

from _sre import MAXREPEAT, MAXGROUPS

Expand Down
45 changes: 16 additions & 29 deletions Lib/re/_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -984,24 +984,28 @@ def parse(str, flags=0, state=None):

return p

def parse_template(source, state):
def parse_template(source, pattern):
gpshead marked this conversation as resolved.
Show resolved Hide resolved
# parse 're' replacement string into list of literals and
# group references
s = Tokenizer(source)
sget = s.get
groups = []
literals = []
result = []
literal = []
lappend = literal.append
def addliteral():
if s.istext:
result.append(''.join(literal))
else:
# The tokenizer implicitly decodes bytes objects as latin-1, we must
# therefore re-encode the final representation.
result.append(''.join(literal).encode('latin-1'))
del literal[:]
def addgroup(index, pos):
if index > state.groups:
if index > pattern.groups:
raise s.error("invalid group reference %d" % index, pos)
if literal:
literals.append(''.join(literal))
del literal[:]
groups.append((len(literals), index))
literals.append(None)
groupindex = state.groupindex
addliteral()
result.append(index)
groupindex = pattern.groupindex
while True:
this = sget()
if this is None:
Expand Down Expand Up @@ -1063,22 +1067,5 @@ def addgroup(index, pos):
lappend(this)
else:
lappend(this)
if literal:
literals.append(''.join(literal))
if not isinstance(source, str):
# The tokenizer implicitly decodes bytes objects as latin-1, we must
# therefore re-encode the final representation.
literals = [None if s is None else s.encode('latin-1') for s in literals]
return groups, literals

def expand_template(template, match):
g = match.group
empty = match.string[:0]
groups, literals = template
literals = literals[:]
try:
for index, group in groups:
literals[index] = g(group) or empty
except IndexError:
raise error("invalid group reference %d" % index) from None
return empty.join(literals)
addliteral()
return result
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Speed up the regular expression substitution (functions :func:`re.sub` and
:func:`re.subn` and corresponding :class:`re.Pattern` methods) for
replacement strings containing group references by 2--3 times.
41 changes: 40 additions & 1 deletion Modules/_sre/clinic/sre.c.h

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading