Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support xPath for Windows pathnames #5310

Merged
merged 12 commits into from
Nov 30, 2022
66 changes: 34 additions & 32 deletions src/datasets/download/streaming_download_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -332,21 +332,6 @@ def xrelpath(path, start=None):
return posixpath.relpath(main_hop, start=str(start).split("::")[0]) if start else os.path.relpath(main_hop)


def _as_posix(path: Path):
    """Extend :meth:`pathlib.PurePath.as_posix` to fix missing slashes after the protocol.

    Args:
        path (:obj:`~pathlib.Path`): Path instance to convert.

    Returns:
        :obj:`str`: the POSIX form of ``path`` with ``://`` substituted wherever
        ``SINGLE_SLASH_AFTER_PROTOCOL_PATTERN`` matches.
    """
    posix_str = SINGLE_SLASH_AFTER_PROTOCOL_PATTERN.sub("://", path.as_posix())
    if posix_str.endswith(":"):
        # Add slashes to root of the protocol
        posix_str += "//"
    return posix_str


def _add_retries_to_file_obj_read_method(file_obj):
read = file_obj.read
max_retries = config.STREAMING_READ_MAX_RETRIES
Expand Down Expand Up @@ -449,15 +434,14 @@ def xopen(file: str, mode="r", *args, use_auth_token: Optional[Union[str, bool]]
Returns:
file object
"""
main_hop, *rest_hops = str(file).split("::")
# This works as well for `xopen(str(Path(...)))`
file_str = _as_str(file)
main_hop, *rest_hops = file_str.split("::")
if is_local_path(main_hop):
return open(main_hop, mode, *args, **kwargs)
# required for `xopen(str(Path(...)))` to work
file = _as_posix(Path(file))
main_hop, *rest_hops = file.split("::")
# add headers and cookies for authentication on the HF Hub and for Google Drive
if not rest_hops and (main_hop.startswith("http://") or main_hop.startswith("https://")):
file, new_kwargs = _prepare_http_url_kwargs(file, use_auth_token=use_auth_token)
file, new_kwargs = _prepare_http_url_kwargs(file_str, use_auth_token=use_auth_token)
elif rest_hops and (rest_hops[0].startswith("http://") or rest_hops[0].startswith("https://")):
url = rest_hops[0]
url, http_kwargs = _prepare_http_url_kwargs(url, use_auth_token=use_auth_token)
Expand Down Expand Up @@ -491,7 +475,7 @@ def xlistdir(path: str, use_auth_token: Optional[Union[str, bool]] = None) -> Li
Returns:
`list` of `str`
"""
main_hop, *rest_hops = str(path).split("::")
main_hop, *rest_hops = _as_str(path).split("::")
if is_local_path(main_hop):
return os.listdir(path)
else:
Expand Down Expand Up @@ -526,7 +510,7 @@ def xglob(urlpath, *, recursive=False, use_auth_token: Optional[Union[str, bool]
Returns:
`list` of `str`
"""
main_hop, *rest_hops = str(urlpath).split("::")
main_hop, *rest_hops = _as_str(urlpath).split("::")
if is_local_path(main_hop):
return glob.glob(main_hop, recursive=recursive)
else:
Expand Down Expand Up @@ -562,7 +546,7 @@ def xwalk(urlpath, use_auth_token: Optional[Union[str, bool]] = None):
Yields:
`tuple`: 3-tuple (dirpath, dirnames, filenames).
"""
main_hop, *rest_hops = str(urlpath).split("::")
main_hop, *rest_hops = _as_str(urlpath).split("::")
if is_local_path(main_hop):
yield from os.walk(main_hop)
else:
Expand All @@ -586,6 +570,18 @@ def xwalk(urlpath, use_auth_token: Optional[Union[str, bool]] = None):


class xPath(type(Path())):
"""Extension of `pathlib.Path` to support both local paths and remote URLs."""

def __str__(self):
    """Return the string form of the path.

    When the first ``::``-separated hop is a local path, only that hop is
    returned. Otherwise backslashes are replaced by forward slashes and the
    ``://`` separator after the protocol is restored.
    """
    raw = super().__str__()
    first_hop = raw.split("::")[0]
    if is_local_path(first_hop):
        return first_hop
    url = SINGLE_SLASH_AFTER_PROTOCOL_PATTERN.sub("://", raw.replace("\\", "/"))
    if url.endswith(":"):
        # Add slashes to root of the protocol
        url += "//"
    return url

def glob(self, pattern, use_auth_token: Optional[Union[str, bool]] = None):
"""Glob function for argument of type :obj:`~pathlib.Path` that supports both local paths and remote URLs.

Expand All @@ -597,7 +593,7 @@ def glob(self, pattern, use_auth_token: Optional[Union[str, bool]] = None):
Yields:
[`xPath`]
"""
posix_path = _as_posix(self)
posix_path = self.as_posix()
main_hop, *rest_hops = posix_path.split("::")
if is_local_path(main_hop):
yield from Path(main_hop).glob(pattern)
Expand Down Expand Up @@ -637,7 +633,7 @@ def parent(self) -> "xPath":
Returns:
[`xPath`]
"""
return type(self)(xdirname(_as_posix(self)))
return type(self)(xdirname(self.as_posix()))

@property
def name(self) -> str:
Expand All @@ -646,7 +642,7 @@ def name(self) -> str:
Returns:
`str`
"""
return PurePosixPath(_as_posix(self).split("::")[0]).name
return PurePosixPath(self.as_posix().split("::")[0]).name

@property
def stem(self) -> str:
Expand All @@ -655,7 +651,7 @@ def stem(self) -> str:
Returns:
`str`
"""
return PurePosixPath(_as_posix(self).split("::")[0]).stem
return PurePosixPath(self.as_posix().split("::")[0]).stem

@property
def suffix(self) -> str:
Expand All @@ -664,7 +660,7 @@ def suffix(self) -> str:
Returns:
`str`
"""
return PurePosixPath(_as_posix(self).split("::")[0]).suffix
return PurePosixPath(self.as_posix().split("::")[0]).suffix

def open(self, *args, **kwargs):
"""Extend :func:`xopen` to support argument of type :obj:`~pathlib.Path`.
Expand All @@ -676,7 +672,7 @@ def open(self, *args, **kwargs):
Returns:
`io.FileIO`: File-like object.
"""
return xopen(_as_posix(self), *args, **kwargs)
return xopen(str(self), *args, **kwargs)

def joinpath(self, *p: Tuple[str, ...]) -> "xPath":
"""Extend :func:`xjoin` to support argument of type :obj:`~pathlib.Path`.
Expand All @@ -687,14 +683,20 @@ def joinpath(self, *p: Tuple[str, ...]) -> "xPath":
Returns:
[`xPath`]
"""
return type(self)(xjoin(_as_posix(self), *p))
return type(self)(xjoin(self.as_posix(), *p))

def __truediv__(self, p: str) -> "xPath":
# The `/` operator simply delegates to `joinpath`.
return self.joinpath(p)

def with_suffix(self, suffix):
main_hop, *rest_hops = _as_posix(self).split("::")
return type(self)("::".join([_as_posix(PurePosixPath(main_hop).with_suffix(suffix))] + rest_hops))
main_hop, *rest_hops = str(self).split("::")
if is_local_path(main_hop):
return type(self)(str(super().with_suffix(suffix)))
return type(self)("::".join([type(self)(PurePosixPath(main_hop).with_suffix(suffix)).as_posix()] + rest_hops))


def _as_str(path: Union[str, Path, xPath]):
    """Return ``str()`` of ``path`` after making sure it is an ``xPath``.

    Anything that is not already an ``xPath`` is first converted with
    ``xPath(str(path))`` so that ``xPath.__str__`` produces the result.
    """
    if not isinstance(path, xPath):
        path = xPath(str(path))
    return str(path)


def xgzip_open(filepath_or_buffer, *args, use_auth_token: Optional[Union[str, bool]] = None, **kwargs):
Expand Down
Loading