Skip to content

Commit

Permalink
feat: simplify object path split (#1028)
Browse files Browse the repository at this point in the history
* simplify object path split

* add example from #975

* fix tests

* add more test cases

* test case update

* remove scheme unused regex
  • Loading branch information
lobis committed Dec 12, 2023
1 parent 81995ab commit 011db53
Show file tree
Hide file tree
Showing 4 changed files with 79 additions and 73 deletions.
68 changes: 14 additions & 54 deletions src/uproot/_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
import itertools
import numbers
import os
import pathlib
import platform
import re
import warnings
Expand Down Expand Up @@ -290,14 +289,10 @@ def regularize_path(path):
_windows_absolute_path_pattern = re.compile(r"^[A-Za-z]:[\\/]")
_windows_absolute_path_pattern_slash = re.compile(r"^[\\/][A-Za-z]:[\\/]")

# These schemes may not appear in fsspec if the corresponding libraries are not installed (e.g. s3fs)
_remote_schemes = ["root", "s3", "http", "https"]
_schemes = list({*_remote_schemes, *fsspec.available_protocols()})

_uri_scheme = re.compile("^(" + "|".join([re.escape(x) for x in _schemes]) + ")://")
_uri_scheme_chain = re.compile(
"^(" + "|".join([re.escape(x) for x in _schemes]) + ")::"
)


def file_object_path_split(urlpath: str) -> tuple[str, str | None]:
"""
Expand All @@ -313,54 +308,19 @@ def file_object_path_split(urlpath: str) -> tuple[str, str | None]:
"""

urlpath: str = regularize_path(urlpath).strip()
path = urlpath

def _split_path(path: str) -> list[str]:
parts = path.split(":")
if pathlib.PureWindowsPath(path).drive:
# Windows absolute path
assert len(parts) >= 2, f"could not split object from windows path {path}"
parts = [parts[0] + ":" + parts[1]] + parts[2:]
return parts

if "://" not in path:
path = "file://" + path

# replace the match of _uri_scheme_chain with "" until there is no match
while _uri_scheme_chain.match(path):
path = _uri_scheme_chain.sub("", path)

if _uri_scheme.match(path):
# if not a local path, attempt to match a URI scheme
if path.startswith("file://"):
parsed_url_path = path[7:]
else:
parsed_url_path = urlparse(path).path

if parsed_url_path.startswith("//"):
parsed_url_path = parsed_url_path[2:]

parts = _split_path(parsed_url_path)
else:
# invalid scheme
scheme = path.split("://")[0]
raise ValueError(
f"Invalid URI scheme: '{scheme}://' in {path}. Available schemes: {', '.join(_schemes)}."
)

if len(parts) == 1:
obj = None
elif len(parts) == 2:
obj = parts[1]
# remove the object from the path (including the colon)
urlpath = urlpath[: -len(obj) - 1]
# clean badly placed slashes
obj = obj.strip().lstrip("/")
while "//" in obj:
obj = obj.replace("//", "/")
else:
raise ValueError(f"could not split object from path {path}")

obj = None

separator = "::"
parts = urlpath.split(separator)
object_regex = re.compile(r"(.+\.root):(.*$)")
for i, part in enumerate(reversed(parts)):
match = object_regex.match(part)
if match:
obj = re.sub(r"/+", "/", match.group(2).strip().lstrip("/")).rstrip("/")
parts[-i - 1] = match.group(1)
break

urlpath = separator.join(parts)
return urlpath, obj


Expand Down
8 changes: 4 additions & 4 deletions tests/test_0001_source_class.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,13 +148,13 @@ def test_colons_and_ports():
"https://example.com:443",
None,
)
assert uproot._util.file_object_path_split("https://example.com:443/something") == (
"https://example.com:443/something",
assert uproot._util.file_object_path_split("https://example.com:443/file.root") == (
"https://example.com:443/file.root",
None,
)
assert uproot._util.file_object_path_split(
"https://example.com:443/something:else"
) == ("https://example.com:443/something", "else")
"https://example.com:443/file.root:object"
) == ("https://example.com:443/file.root", "object")


@pytest.mark.parametrize("use_threads", [True, False], indirect=True)
Expand Down
2 changes: 1 addition & 1 deletion tests/test_0692_fsspec_reading.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,7 +208,7 @@ def test_fsspec_zip(tmp_path):

# open with fsspec
with uproot.open(
f"zip://{filename}::file://{filename_zip}:Events/MET_pt"
f"zip://{filename}:Events/MET_pt::file://{filename_zip}"
) as branch:
data = branch.array(library="np")
assert len(data) == 40
Expand Down
74 changes: 60 additions & 14 deletions tests/test_0976_path_object_split.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,24 +64,38 @@
),
),
(
"ssh://user@host:22/path/to/file:object",
"ssh://user@host:22/path/to/file.root:/object//path",
(
"ssh://user@host:22/path/to/file",
"object",
"ssh://user@host:22/path/to/file.root",
"object/path",
),
),
(
"ssh://user@host:50230/path/to/file",
"ssh://user@host:22/path/to/file.root:/object//path:with:colon:in:path/something/",
(
"ssh://user@host:50230/path/to/file",
"ssh://user@host:22/path/to/file.root",
"object/path:with:colon:in:path/something",
),
),
(
"ssh://user@host:50230/path/to/file.root",
(
"ssh://user@host:50230/path/to/file.root",
None,
),
),
(
"s3://bucket/path/to/file:object",
"s3://bucket/path/to/file.root:/dir////object",
(
"s3://bucket/path/to/file.root",
"dir/object",
),
),
(
"s3://bucket/path/to/file.root:",
(
"s3://bucket/path/to/file",
"object",
"s3://bucket/path/to/file.root",
"",
),
),
(
Expand All @@ -98,27 +112,56 @@
None,
),
),
# https://github.com/scikit-hep/uproot5/issues/975
(
"zip://uproot-issue121.root::file:///tmp/pytest-of-runner/pytest-0/test_fsspec_zip0/uproot-issue121.root.zip:Events/MET_pt",
"DAOD_PHYSLITE_2023-09-13T1230.art.rntuple.root:RNT:CollectionTree",
(
"DAOD_PHYSLITE_2023-09-13T1230.art.rntuple.root",
"RNT:CollectionTree",
),
),
(
"zip://uproot-issue121.root:Events/MET_pt::file:///tmp/pytest-of-runner/pytest-0/test_fsspec_zip0/uproot-issue121.root.zip",
(
"zip://uproot-issue121.root::file:///tmp/pytest-of-runner/pytest-0/test_fsspec_zip0/uproot-issue121.root.zip",
"Events/MET_pt",
),
),
(
"simplecache::zip://uproot-issue121.root::file:///tmp/pytest-of-runner/pytest-0/test_fsspec_zip0/uproot-issue121.root.zip:Events/MET_pt",
"simplecache::zip://uproot-issue121.root:Events/MET_pt::file:///tmp/pytest-of-runner/pytest-0/test_fsspec_zip0/uproot-issue121.root.zip",
(
"simplecache::zip://uproot-issue121.root::file:///tmp/pytest-of-runner/pytest-0/test_fsspec_zip0/uproot-issue121.root.zip",
"Events/MET_pt",
),
),
(
r"zip://uproot-issue121.root::file://C:\Users\runneradmin\AppData\Local\Temp\pytest-of-runneradmin\pytest-0\test_fsspec_zip0\uproot-issue121.root.zip:Events/MET_pt",
r"zip://uproot-issue121.root:Events/MET_pt::file://C:\Users\runneradmin\AppData\Local\Temp\pytest-of-runneradmin\pytest-0\test_fsspec_zip0\uproot-issue121.root.zip",
(
r"zip://uproot-issue121.root::file://C:\Users\runneradmin\AppData\Local\Temp\pytest-of-runneradmin\pytest-0\test_fsspec_zip0\uproot-issue121.root.zip",
"Events/MET_pt",
),
),
(
"zip://uproot-issue121.root:Events/MET_pt::file:///some/weird/path:with:colons/file.root",
(
"zip://uproot-issue121.root::file:///some/weird/path:with:colons/file.root",
"Events/MET_pt",
),
),
(
"/some/weird/path:with:colons/file.root:Events/MET_pt",
(
"/some/weird/path:with:colons/file.root",
"Events/MET_pt",
),
),
(
"/some/weird/path:with:colons/file.root",
(
"/some/weird/path:with:colons/file.root",
None,
),
),
],
)
def test_url_split(input_value, expected_output):
Expand All @@ -131,9 +174,12 @@ def test_url_split(input_value, expected_output):
@pytest.mark.parametrize(
"input_value",
[
"local/file.root://Events",
"local/file.root.zip://Events",
"local/file.roo://Events",
"local/file://Events",
],
)
def test_url_split_invalid(input_value):
with pytest.raises(ValueError):
uproot._util.file_object_path_split(input_value)
path, obj = uproot._util.file_object_path_split(input_value)
assert obj is None
assert path == input_value

0 comments on commit 011db53

Please sign in to comment.