Skip to content

Commit

Permalink
pythonGH-101362 - Optimise pathlib by deferring path normalisation
Browse files Browse the repository at this point in the history
`PurePath` now normalises and splits paths only when necessary, e.g. when
`.name` or `.parent` is accessed. The result is cached. This speeds up path
object construction by around 4x.

`PurePath.__fspath__()` now returns an unnormalised path, which should be
transparent to filesystem APIs (else pathlib's normalisation is broken!).
This extends the earlier performance improvement to most impure `Path`
methods, and also speeds up pickling, `p.joinpath('bar')` and `p / 'bar'`.

This also fixes pythonGH-76846 and pythonGH-85281 by unifying path constructors and
adding an `__init__()` method.
  • Loading branch information
barneygale committed Feb 4, 2023
1 parent 144aaa7 commit b428069
Show file tree
Hide file tree
Showing 2 changed files with 85 additions and 89 deletions.
170 changes: 82 additions & 88 deletions Lib/pathlib.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,8 +216,8 @@ class _PathParents(Sequence):
def __init__(self, path):
# We don't store the instance to avoid reference cycles
self._pathcls = type(path)
self._drv = path._drv
self._root = path._root
self._drv = path.drive
self._root = path.root
self._parts = path._parts

def __len__(self):
Expand Down Expand Up @@ -251,33 +251,45 @@ class PurePath(object):
directly, regardless of your system.
"""
__slots__ = (
'_drv', '_root', '_parts',
'_fspath', '_drv', '_root', '_parts_cached',
'_str', '_hash', '_parts_tuple', '_parts_normcase_cached',
)
_flavour = os.path

def __new__(cls, *args):
def __new__(cls, *args, **kwargs):
"""Construct a PurePath from one or several strings and or existing
PurePath objects. The strings and path objects are combined so as
to yield a canonicalized path, which is incorporated into the
new PurePath object.
"""
if cls is PurePath:
cls = PureWindowsPath if os.name == 'nt' else PurePosixPath
return cls._from_parts(args)
return super().__new__(cls)

def __reduce__(self):
# Using the parts tuple helps share interned path parts
# when pickling related paths.
return (self.__class__, tuple(self._parts))
return (self.__class__, (self._fspath,))

def __init__(self, *args):
if not args:
path = ''
elif len(args) == 1:
path = os.fspath(args[0])
else:
path = self._flavour.join(*args)

if not isinstance(path, str):
raise TypeError(
"argument should be a str object or an os.PathLike "
"object returning str, not %r"
% type(path))
self._fspath = path

@classmethod
def _parse_parts(cls, parts):
if not parts:
def _parse_path(cls, path):
if not path:
return '', '', []
sep = cls._flavour.sep
altsep = cls._flavour.altsep
path = cls._flavour.join(*parts)
if altsep:
path = path.replace(altsep, sep)
drv, root, rel = cls._flavour.splitroot(path)
Expand All @@ -288,43 +300,18 @@ def _parse_parts(cls, parts):
parsed = [sys.intern(x) for x in unfiltered_parsed if x and x != '.']
return drv, root, parsed

@classmethod
def _parse_args(cls, args):
# This is useful when you don't want to create an instance, just
# canonicalize some constructor arguments.
parts = []
for a in args:
if isinstance(a, PurePath):
parts += a._parts
else:
a = os.fspath(a)
if isinstance(a, str):
# Force-cast str subclasses to str (issue #21127)
parts.append(str(a))
else:
raise TypeError(
"argument should be a str object or an os.PathLike "
"object returning str, not %r"
% type(a))
return cls._parse_parts(parts)

@classmethod
def _from_parts(cls, args):
# We need to call _parse_args on the instance, so as to get the
# right flavour.
self = object.__new__(cls)
drv, root, parts = self._parse_args(args)
def _load_parts(self):
drv, root, parts = self._parse_path(self._fspath)
self._drv = drv
self._root = root
self._parts = parts
return self
self._parts_cached = parts

@classmethod
def _from_parsed_parts(cls, drv, root, parts):
self = object.__new__(cls)
self = cls(cls._format_parsed_parts(drv, root, parts))
self._drv = drv
self._root = root
self._parts = parts
self._parts_cached = parts
return self

@classmethod
Expand All @@ -340,12 +327,12 @@ def __str__(self):
try:
return self._str
except AttributeError:
self._str = self._format_parsed_parts(self._drv, self._root,
self._str = self._format_parsed_parts(self.drive, self.root,
self._parts) or '.'
return self._str

def __fspath__(self):
return str(self)
return self._fspath or '.'

def as_posix(self):
"""Return the string representation of the path with forward (/)
Expand All @@ -356,7 +343,7 @@ def as_posix(self):
def __bytes__(self):
"""Return the bytes representation of the path. This is only
recommended to use under Unix."""
return os.fsencode(self)
return os.fsencode(str(self))

def __repr__(self):
return "{}({!r})".format(self.__class__.__name__, self.as_posix())
Expand All @@ -366,7 +353,7 @@ def as_uri(self):
if not self.is_absolute():
raise ValueError("relative path can't be expressed as a file URI")

drive = self._drv
drive = self.drive
if len(drive) == 2 and drive[1] == ':':
# It's a path on a local drive => 'file:///c:/a/b'
prefix = 'file:///' + drive
Expand Down Expand Up @@ -422,23 +409,43 @@ def __ge__(self, other):
return NotImplemented
return self._parts_normcase >= other._parts_normcase

drive = property(attrgetter('_drv'),
doc="""The drive prefix (letter or UNC path), if any.""")
@property
def drive(self):
"""The drive prefix (letter or UNC path), if any."""
try:
return self._drv
except AttributeError:
self._load_parts()
return self._drv

root = property(attrgetter('_root'),
doc="""The root of the path, if any.""")
@property
def root(self):
"""The root of the path, if any."""
try:
return self._root
except AttributeError:
self._load_parts()
return self._root

@property
def _parts(self):
try:
return self._parts_cached
except AttributeError:
self._load_parts()
return self._parts_cached

@property
def anchor(self):
"""The concatenation of the drive and root, or ''."""
anchor = self._drv + self._root
anchor = self.drive + self.root
return anchor

@property
def name(self):
"""The final path component, if any."""
parts = self._parts
if len(parts) == (1 if (self._drv or self._root) else 0):
if len(parts) == (1 if (self.drive or self.root) else 0):
return ''
return parts[-1]

Expand Down Expand Up @@ -487,7 +494,7 @@ def with_name(self, name):
drv, root, tail = f.splitroot(name)
if drv or root or not tail or f.sep in tail or (f.altsep and f.altsep in tail):
raise ValueError("Invalid name %r" % (name))
return self._from_parsed_parts(self._drv, self._root,
return self._from_parsed_parts(self.drive, self.root,
self._parts[:-1] + [name])

def with_stem(self, stem):
Expand All @@ -512,7 +519,7 @@ def with_suffix(self, suffix):
name = name + suffix
else:
name = name[:-len(old_suffix)] + suffix
return self._from_parsed_parts(self._drv, self._root,
return self._from_parsed_parts(self.drive, self.root,
self._parts[:-1] + [name])

def relative_to(self, other, /, *_deprecated, walk_up=False):
Expand Down Expand Up @@ -571,22 +578,7 @@ def joinpath(self, *args):
paths) or a totally different path (if one of the arguments is
anchored).
"""
drv1, root1, parts1 = self._drv, self._root, self._parts
drv2, root2, parts2 = self._parse_args(args)
if root2:
if not drv2 and drv1:
return self._from_parsed_parts(drv1, root2, [drv1 + root2] + parts2[1:])
else:
return self._from_parsed_parts(drv2, root2, parts2)
elif drv2:
if drv2 == drv1 or self._flavour.normcase(drv2) == self._flavour.normcase(drv1):
# Same drive => second path is relative to the first.
return self._from_parsed_parts(drv1, root1, parts1 + parts2[1:])
else:
return self._from_parsed_parts(drv2, root2, parts2)
else:
# Second path is non-anchored (common case).
return self._from_parsed_parts(drv1, root1, parts1 + parts2)
return type(self)(self, *args)

def __truediv__(self, key):
try:
Expand All @@ -596,15 +588,15 @@ def __truediv__(self, key):

def __rtruediv__(self, key):
try:
return self._from_parts([key] + self._parts)
return type(self)(key, self)
except TypeError:
return NotImplemented

@property
def parent(self):
"""The logical parent of the path."""
drv = self._drv
root = self._root
drv = self.drive
root = self.root
parts = self._parts
if len(parts) == 1 and (drv or root):
return self
Expand All @@ -620,7 +612,7 @@ def is_absolute(self):
a drive)."""
# ntpath.isabs() is defective - see GH-44626 .
if self._flavour is ntpath:
return bool(self._drv and self._root)
return bool(self.drive and self.root)
return self._flavour.isabs(self)

def is_reserved(self):
Expand All @@ -644,12 +636,12 @@ def match(self, path_pattern):
Return True if this path matches the given pattern.
"""
path_pattern = self._flavour.normcase(path_pattern)
drv, root, pat_parts = self._parse_parts((path_pattern,))
drv, root, pat_parts = self._parse_path(path_pattern)
if not pat_parts:
raise ValueError("empty pattern")
elif drv and drv != self._flavour.normcase(self._drv):
elif drv and drv != self._flavour.normcase(self.drive):
return False
elif root and root != self._root:
elif root and root != self.root:
return False
parts = self._parts_normcase
if drv or root:
Expand Down Expand Up @@ -702,24 +694,26 @@ class Path(PurePath):
"""
__slots__ = ()

def __new__(cls, *args, **kwargs):
def __init__(self, *args, **kwargs):
if kwargs:
msg = ("support for supplying keyword arguments to pathlib.PurePath "
"is deprecated and scheduled for removal in Python {remove}")
warnings._deprecated("pathlib.PurePath(**kwargs)", msg, remove=(3, 14))
super().__init__(*args)

def __new__(cls, *args, **kwargs):
if cls is Path:
cls = WindowsPath if os.name == 'nt' else PosixPath
self = cls._from_parts(args)
if self._flavour is not os.path:
elif cls._flavour is not os.path:
raise NotImplementedError("cannot instantiate %r on your system"
% (cls.__name__,))
return self
return super().__new__(cls)

def _make_child_relpath(self, part):
# This is an optimization used for dir walking. `part` must be
# a single part relative to this path.
parts = self._parts + [part]
return self._from_parsed_parts(self._drv, self._root, parts)
return self._from_parsed_parts(self.drive, self.root, parts)

def __enter__(self):
# In previous versions of pathlib, __exit__() marked this path as
Expand Down Expand Up @@ -789,7 +783,7 @@ def glob(self, pattern):
sys.audit("pathlib.Path.glob", self, pattern)
if not pattern:
raise ValueError("Unacceptable pattern: {!r}".format(pattern))
drv, root, pattern_parts = self._parse_parts((pattern,))
drv, root, pattern_parts = self._parse_path(pattern)
if drv or root:
raise NotImplementedError("Non-relative patterns are unsupported")
if pattern[-1] in (self._flavour.sep, self._flavour.altsep):
Expand All @@ -804,7 +798,7 @@ def rglob(self, pattern):
this subtree.
"""
sys.audit("pathlib.Path.rglob", self, pattern)
drv, root, pattern_parts = self._parse_parts((pattern,))
drv, root, pattern_parts = self._parse_path(pattern)
if drv or root:
raise NotImplementedError("Non-relative patterns are unsupported")
if pattern and pattern[-1] in (self._flavour.sep, self._flavour.altsep):
Expand All @@ -821,7 +815,7 @@ def absolute(self):
"""
if self.is_absolute():
return self
return self._from_parts([os.getcwd()] + self._parts)
return type(self)(os.getcwd(), *self._parts)

def resolve(self, strict=False):
"""
Expand All @@ -839,7 +833,7 @@ def check_eloop(e):
except OSError as e:
check_eloop(e)
raise
p = self._from_parts((s,))
p = type(self)(s)

# In non-strict mode, realpath() doesn't raise on symlink loops.
# Ensure we get an exception by calling stat()
Expand Down Expand Up @@ -929,7 +923,7 @@ def readlink(self):
"""
if not hasattr(os, "readlink"):
raise NotImplementedError("os.readlink() not available on this system")
return self._from_parts((os.readlink(self),))
return type(self)(os.readlink(self))

def touch(self, mode=0o666, exist_ok=True):
"""
Expand Down Expand Up @@ -1198,12 +1192,12 @@ def expanduser(self):
""" Return a new path with expanded ~ and ~user constructs
(as returned by os.path.expanduser)
"""
if (not (self._drv or self._root) and
if (not (self.drive or self.root) and
self._parts and self._parts[0][:1] == '~'):
homedir = self._flavour.expanduser(self._parts[0])
if homedir[:1] == "~":
raise RuntimeError("Could not determine home directory.")
return self._from_parts([homedir] + self._parts[1:])
return type(self)(homedir, *self._parts[1:])

return self

Expand Down
4 changes: 3 additions & 1 deletion Lib/test/test_pathlib.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,9 @@
class _BaseFlavourTest(object):

def _check_parse_parts(self, arg, expected):
f = self.cls._parse_parts
def f(parts):
path = self.flavour.join(*parts) if parts else ''
return self.cls._parse_path(path)
sep = self.flavour.sep
altsep = self.flavour.altsep
actual = f([x.replace('/', sep) for x in arg])
Expand Down

0 comments on commit b428069

Please sign in to comment.