Skip to content

Commit

Permalink
pythonGH-105113: Improve performance of pathlib.PurePath.match()
Browse files Browse the repository at this point in the history
We now compile a `re.Pattern` object for the entire pattern. This is made
more difficult by `fnmatch` not treating directory separators as special
when evaluating wildcards (`*`, `?`, etc), and so we arrange the path parts
onto separate *lines* in a string, and ensure we don't set `re.DOTALL`.

Co-authored-by: Hugo van Kemenade <hugovk@users.noreply.github.com>
Co-authored-by: Alex Waygood <Alex.Waygood@Gmail.com>
  • Loading branch information
3 people committed May 30, 2023
1 parent 9ae49e3 commit b100cef
Show file tree
Hide file tree
Showing 3 changed files with 81 additions and 14 deletions.
7 changes: 7 additions & 0 deletions Doc/library/pathlib.rst
Original file line number Diff line number Diff line change
Expand Up @@ -569,6 +569,13 @@ Pure paths provide the following methods and properties:
>>> PurePath('a/b.py').match('/*.py')
False

The *pattern* may be another path object; this speeds up matching the same
pattern against multiple files::

>>> pattern = PurePath('*.py')
>>> PurePath('a/b.py').match(pattern)
True

As with other methods, case-sensitivity follows platform defaults::

>>> PurePosixPath('b.py').match('*.PY')
Expand Down
86 changes: 72 additions & 14 deletions Lib/pathlib.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,13 +54,30 @@ def _ignore_error(exception):
getattr(exception, 'winerror', None) in _IGNORED_WINERRORS)


@functools.cache
def _is_case_sensitive(flavour):
return flavour.normcase('Aa') == 'Aa'

#
# Globbing helpers
#


# fnmatch.translate() returns a regular expression that includes a prefix and
# a suffix, which enable matching newlines and ensure the end of the string is
# matched, respectively. These features are undesirable for our implementation
# of PurePatch.match(), which represents path separators as newlines and joins
# pattern segments together. As a workaround, we define a slice object that
# can remove the prefix and suffix from any translate() result. See the
# _compile_pattern_lines() function for more details.
_FNMATCH_PREFIX, _FNMATCH_SUFFIX = fnmatch.translate('_').split('_')
_FNMATCH_SLICE = slice(len(_FNMATCH_PREFIX), -len(_FNMATCH_SUFFIX))
_SWAP_SEP_AND_NEWLINE = {
'/': str.maketrans({'/': '\n', '\n': '/'}),
'\\': str.maketrans({'\\': '\n', '\n': '\\'}),
}


@functools.lru_cache()
def _make_selector(pattern_parts, flavour, case_sensitive):
pat = pattern_parts[0]
Expand Down Expand Up @@ -92,6 +109,38 @@ def _compile_pattern(pat, case_sensitive):
return re.compile(fnmatch.translate(pat), flags).match


@functools.lru_cache()
def _compile_pattern_lines(pattern_lines, case_sensitive):
"""Compile the given pattern lines to an `re.Pattern` object.
The *pattern_lines* argument is a glob-style pattern (e.g. '*/*.py') with
its path separators and newlines swapped (e.g. '*\n*.py`). By using
newlines to separate path components, and not setting `re.DOTALL`, we
ensure that the `*` wildcard cannot match path separators.
The returned `re.Pattern` object may have its `match()` method called to
match a complete pattern, or `search()` to match from the right. The
argument supplied to these methods must also have its path separators and
newlines swapped.
"""

# Match the start of the path, or just after a path separator
parts = ['^']
for part in pattern_lines.splitlines(keepends=True):
# We slice off the common prefix and suffix added by translate() to
# ensure that re.DOTALL is not set, and the end of the string not
# matched, respectively. With DOTALL not set, '*' wildcards will not
# match path separators, because the '.' characters in the pattern
# will not match newlines.
parts.append(fnmatch.translate(part)[_FNMATCH_SLICE])
# Match the end of the path, always.
parts.append(r'\Z')
flags = re.MULTILINE
if not case_sensitive:
flags |= re.IGNORECASE
return re.compile(''.join(parts), flags=flags)


class _Selector:
"""A selector matches a specific glob pattern part against the children
of a given path."""
Expand Down Expand Up @@ -274,6 +323,10 @@ class PurePath(object):
# to implement comparison methods like `__lt__()`.
'_parts_normcase_cached',

# The `_lines_cached` slot stores the string path with path separators
# and newlines swapped. This is used to implement `match()`.
'_lines_cached',

# The `_hash` slot stores the hash of the case-normalized string
# path. It's set when `__hash__()` is called for the first time.
'_hash',
Expand Down Expand Up @@ -439,6 +492,16 @@ def _parts_normcase(self):
self._parts_normcase_cached = self._str_normcase.split(self._flavour.sep)
return self._parts_normcase_cached

@property
def _lines(self):
# Path with separators and newlines swapped, for pattern matching.
try:
return self._lines_cached
except AttributeError:
trans = _SWAP_SEP_AND_NEWLINE[self._flavour.sep]
self._lines_cached = str(self).translate(trans)
return self._lines_cached

def __eq__(self, other):
if not isinstance(other, PurePath):
return NotImplemented
Expand Down Expand Up @@ -695,23 +758,18 @@ def match(self, path_pattern, *, case_sensitive=None):
"""
Return True if this path matches the given pattern.
"""
if not isinstance(path_pattern, PurePath):
path_pattern = self.with_segments(path_pattern)
if case_sensitive is None:
case_sensitive = _is_case_sensitive(self._flavour)
pat = self.with_segments(path_pattern)
if not pat.parts:
pattern = _compile_pattern_lines(path_pattern._lines, case_sensitive)
if path_pattern.drive or path_pattern.root:
return pattern.match(self._lines) is not None
elif path_pattern._tail:
return pattern.search(self._lines) is not None
else:
raise ValueError("empty pattern")
pat_parts = pat.parts
parts = self.parts
if pat.drive or pat.root:
if len(pat_parts) != len(parts):
return False
elif len(pat_parts) > len(parts):
return False
for part, pat in zip(reversed(parts), reversed(pat_parts)):
match = _compile_pattern(pat, case_sensitive)
if not match(part):
return False
return True


# Can't subclass os.PathLike from PurePath and keep the constructor
# optimizations in PurePath.__slots__.
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Improve performance of :meth:`pathlib.PurePath.match` by compiling an
:class:`re.Pattern` object for the entire pattern.

0 comments on commit b100cef

Please sign in to comment.