Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix parsing of JSON index dist-info-metadata values #12078

Merged
merged 12 commits into from
Jun 27, 2023
1 change: 1 addition & 0 deletions news/12042.bugfix.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Correctly parse ``dist-info-metadata`` values from JSON-format index data.
97 changes: 65 additions & 32 deletions src/pip/_internal/models/link.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,18 +69,6 @@ class LinkHash:
def __post_init__(self) -> None:
assert self.name in _SUPPORTED_HASHES

@classmethod
def parse_pep658_hash(cls, dist_info_metadata: str) -> Optional["LinkHash"]:
"""Parse a PEP 658 data-dist-info-metadata hash."""
if dist_info_metadata == "true":
return None
name, sep, value = dist_info_metadata.partition("=")
if not sep:
return None
if name not in _SUPPORTED_HASHES:
return None
return cls(name=name, value=value)

@classmethod
@functools.lru_cache(maxsize=None)
def find_hash_url_fragment(cls, url: str) -> Optional["LinkHash"]:
Expand All @@ -107,6 +95,20 @@ def is_hash_allowed(self, hashes: Optional[Hashes]) -> bool:
return hashes.is_hash_allowed(self.name, hex_digest=self.value)


@dataclass(frozen=True)
class MetadataFile:
"""Information about a core metadata file associated with a distribution."""

hashes: Optional[dict[str, str]]

# TODO: Do we care about stripping out unsupported hash methods?
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't have a strong preference since the values in _SUPPORTED_HASHES look fine to me (though it might be time to retire md5 from that list). In theory people can use a number of other hashes that this method may not pick up, but I think that can/should be handled as a separate task of maintaining the list of supported hashes.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Seems reasonable. This is mostly about handling garbage produced by incorrect servers (stuff like "sah256=abcbcba") than about limiting what hashes we use.

The attribute is already a weird tri-state thing (None = no data supplied, MetadataFile(None) = supplied with no hashes, and the full thing with hashes. There's also a dodgy MetadataFile({}) which we get if we end up stripping everything, which is probably like `MetadataFile(None), but consumers might not like an empty dict of hashes.

I think what I should probably do is validate the input up front, rather than in this class. Then we just assert here that hashes must be None or a dictionary containing only one or more supported hash types.

def __init__(self, hashes: Optional[dict[str, str]]):
if hashes:
hashes = {n: v for n, v in hashes.items() if n in _SUPPORTED_HASHES}
# We need to use this as this is a frozen dataclass
object.__setattr__(self, "hashes", hashes)


def _clean_url_path_part(part: str) -> str:
"""
Clean a "part" of a URL path (i.e. after splitting on "@" characters).
Expand Down Expand Up @@ -179,7 +181,7 @@ class Link(KeyBasedCompareMixin):
"comes_from",
"requires_python",
"yanked_reason",
"dist_info_metadata",
"metadata_file_data",
"cache_link_parsing",
"egg_fragment",
]
Expand All @@ -190,7 +192,7 @@ def __init__(
comes_from: Optional[Union[str, "IndexContent"]] = None,
requires_python: Optional[str] = None,
yanked_reason: Optional[str] = None,
dist_info_metadata: Optional[str] = None,
metadata_file_data: Optional[MetadataFile] = None,
cache_link_parsing: bool = True,
hashes: Optional[Mapping[str, str]] = None,
) -> None:
Expand All @@ -208,18 +210,21 @@ def __init__(
a simple repository HTML link. If the file has been yanked but
no reason was provided, this should be the empty string. See
PEP 592 for more information and the specification.
:param dist_info_metadata: the metadata attached to the file, or None if no such
metadata is provided. This is the value of the "data-dist-info-metadata"
attribute, if present, in a simple repository HTML link. This may be parsed
into its own `Link` by `self.metadata_link()`. See PEP 658 for more
information and the specification.
:param metadata_file_data: the metadata attached to the file, or None if
no such metadata is provided. This argument, if not None, indicates
that a separate metadata file exists, and also optionally supplies
hashes for that file.
:param cache_link_parsing: A flag that is used elsewhere to determine
whether resources retrieved from this link should be cached. PyPI
URLs should generally have this set to False, for example.
:param hashes: A mapping of hash names to digests to allow us to
determine the validity of a download.
"""

# The comes_from, requires_python, and metadata_file_data arguments are
# only used by classmethods of this class, and are not used in client
# code directly.

# url can be a UNC windows share
if url.startswith("\\\\"):
url = path_to_url(url)
Expand All @@ -239,7 +244,7 @@ def __init__(
self.comes_from = comes_from
self.requires_python = requires_python if requires_python else None
self.yanked_reason = yanked_reason
self.dist_info_metadata = dist_info_metadata
self.metadata_file_data = metadata_file_data

super().__init__(key=url, defining_class=Link)

Expand All @@ -262,9 +267,20 @@ def from_json(
url = _ensure_quoted_url(urllib.parse.urljoin(page_url, file_url))
pyrequire = file_data.get("requires-python")
yanked_reason = file_data.get("yanked")
dist_info_metadata = file_data.get("dist-info-metadata")
hashes = file_data.get("hashes", {})

# The dist-info-metadata value may be a boolean, or a dict of hashes.
metadata_info = file_data.get("dist-info-metadata", False)
if isinstance(metadata_info, dict):
# The file exists, and hashes have been supplied
metadata_file_data = MetadataFile(metadata_info)
elif metadata_info:
# The file exists, but there are no hashes
metadata_file_data = MetadataFile(None)
else:
# The file does not exist
metadata_file_data = None

# The Link.yanked_reason expects an empty string instead of a boolean.
if yanked_reason and not isinstance(yanked_reason, str):
yanked_reason = ""
Expand All @@ -278,7 +294,7 @@ def from_json(
requires_python=pyrequire,
yanked_reason=yanked_reason,
hashes=hashes,
dist_info_metadata=dist_info_metadata,
metadata_file_data=metadata_file_data,
)

@classmethod
Expand All @@ -298,14 +314,35 @@ def from_element(
url = _ensure_quoted_url(urllib.parse.urljoin(base_url, href))
pyrequire = anchor_attribs.get("data-requires-python")
yanked_reason = anchor_attribs.get("data-yanked")
dist_info_metadata = anchor_attribs.get("data-dist-info-metadata")

# The dist-info-metadata value may be the string "true", or a string of
# the form "hashname=hashval"
metadata_info = anchor_attribs.get("data-dist-info-metadata")
if metadata_info == "true":
# The file exists, but there are no hashes
metadata_file_data = MetadataFile(None)
elif metadata_info is None:
# The file does not exist
metadata_file_data = None
else:
# The file exists, and hashes have been supplied
hashname, sep, hashval = metadata_info.partition("=")
if sep == "=":
metadata_file_data = MetadataFile({hashname: hashval})
else:
# Error - data is wrong. Treat as no hashes supplied.
logger.debug(
"Index returned invalid data-dist-info-metadata value: %s",
metadata_info,
)
metadata_file_data = MetadataFile(None)

return cls(
url,
comes_from=page_url,
requires_python=pyrequire,
yanked_reason=yanked_reason,
dist_info_metadata=dist_info_metadata,
metadata_file_data=metadata_file_data,
)

def __str__(self) -> str:
Expand Down Expand Up @@ -407,17 +444,13 @@ def subdirectory_fragment(self) -> Optional[str]:
return match.group(1)

def metadata_link(self) -> Optional["Link"]:
"""Implementation of PEP 658 parsing."""
# Note that Link.from_element() parsing the "data-dist-info-metadata" attribute
# from an HTML anchor tag is typically how the Link.dist_info_metadata attribute
# gets set.
if self.dist_info_metadata is None:
"""Return a link to the associated core metadata file (if any)."""
if self.metadata_file_data is None:
return None
metadata_url = f"{self.url_without_fragment}.metadata"
metadata_link_hash = LinkHash.parse_pep658_hash(self.dist_info_metadata)
if metadata_link_hash is None:
if self.metadata_file_data.hashes is None:
return Link(metadata_url)
return Link(metadata_url, hashes=metadata_link_hash.as_dict())
return Link(metadata_url, hashes=self.metadata_file_data.hashes)

def as_hashes(self) -> Hashes:
return Hashes({k: [v] for k, v in self._hashes.items()})
Expand Down
37 changes: 24 additions & 13 deletions tests/unit/test_collector.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
from pip._internal.models.link import (
Link,
LinkHash,
MetadataFile,
_clean_url_path,
_ensure_quoted_url,
)
Expand Down Expand Up @@ -527,7 +528,7 @@ def test_parse_links_json() -> None:
requires_python=">=3.7",
yanked_reason=None,
hashes={"sha256": "sha256 hash", "blake2b": "blake2b hash"},
dist_info_metadata="sha512=aabdd41",
metadata_file_data=MetadataFile({"sha512": "aabdd41"}),
),
]

Expand Down Expand Up @@ -603,12 +604,12 @@ def test_parse_links__yanked_reason(anchor_html: str, expected: Optional[str]) -
),
],
)
def test_parse_links__dist_info_metadata(
def test_parse_links__metadata_file_data(
anchor_html: str,
expected: Optional[str],
hashes: Dict[str, str],
) -> None:
link = _test_parse_links_data_attribute(anchor_html, "dist_info_metadata", expected)
link = _test_parse_links_data_attribute(anchor_html, "metadata_file_data", expected)
assert link._hashes == hashes


Expand Down Expand Up @@ -1080,17 +1081,27 @@ def test_link_hash_parsing(url: str, result: Optional[LinkHash]) -> None:


@pytest.mark.parametrize(
"dist_info_metadata, result",
"metadata_attrib, expected",
[
("sha256=aa113592bbe", LinkHash("sha256", "aa113592bbe")),
("sha256=", LinkHash("sha256", "")),
("sha500=aa113592bbe", None),
("true", None),
("", None),
("aa113592bbe", None),
("sha256=aa113592bbe", MetadataFile({"sha256": "aa113592bbe"})),
("sha256=", MetadataFile({"sha256": ""})),
("sha500=aa113592bbe", MetadataFile({})),
("true", MetadataFile(None)),
(None, None),
# TODO: Are these correct?
("", MetadataFile(None)),
("aa113592bbe", MetadataFile(None)),
],
)
def test_pep658_hash_parsing(
dist_info_metadata: str, result: Optional[LinkHash]
def test_metadata_file_info_parsing_html(
metadata_attrib: str, expected: Optional[MetadataFile]
) -> None:
assert LinkHash.parse_pep658_hash(dist_info_metadata) == result
attribs: Dict[str, Optional[str]] = {
"href": "something",
"data-dist-info-metadata": metadata_attrib,
}
page_url = "dummy_for_comes_from"
base_url = "https://index.url/simple"
link = Link.from_element(attribs, page_url, base_url)
assert link is not None and link.metadata_file_data == expected
# TODO: Do we need to do something for the JSON data?