Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix parsing of JSON index dist-info-metadata values #12078

Merged
merged 12 commits into from
Jun 27, 2023
1 change: 1 addition & 0 deletions news/12042.bugfix.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Correctly parse ``dist-info-metadata`` values from JSON-format index data.
114 changes: 82 additions & 32 deletions src/pip/_internal/models/link.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,18 +69,6 @@ class LinkHash:
def __post_init__(self) -> None:
assert self.name in _SUPPORTED_HASHES

@classmethod
def parse_pep658_hash(cls, dist_info_metadata: str) -> Optional["LinkHash"]:
"""Parse a PEP 658 data-dist-info-metadata hash."""
if dist_info_metadata == "true":
return None
name, sep, value = dist_info_metadata.partition("=")
if not sep:
return None
if name not in _SUPPORTED_HASHES:
return None
return cls(name=name, value=value)

@classmethod
@functools.lru_cache(maxsize=None)
def find_hash_url_fragment(cls, url: str) -> Optional["LinkHash"]:
Expand All @@ -107,6 +95,28 @@ def is_hash_allowed(self, hashes: Optional[Hashes]) -> bool:
return hashes.is_hash_allowed(self.name, hex_digest=self.value)


@dataclass(frozen=True)
class MetadataFile:
"""Information about a core metadata file associated with a distribution."""

hashes: Optional[Dict[str, str]]

def __post_init__(self) -> None:
if self.hashes is not None:
assert all(name in _SUPPORTED_HASHES for name in self.hashes)


def supported_hashes(hashes: Optional[Dict[str, str]]) -> Optional[Dict[str, str]]:
# Remove any unsupported hash types from the mapping. If this leaves no
# supported hashes, return None
if hashes is None:
return None
hashes = {n: v for n, v in hashes.items() if n in _SUPPORTED_HASHES}
if len(hashes) > 0:
return hashes
return None
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit:

Suggested change
if len(hashes) > 0:
return hashes
return None
if not hashes:
return None
return hashes

Sightly less C-like, and easier to read. :)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed, thanks (commit incoming once I get pre-commit to behave).



def _clean_url_path_part(part: str) -> str:
"""
Clean a "part" of a URL path (i.e. after splitting on "@" characters).
Expand Down Expand Up @@ -179,7 +189,7 @@ class Link(KeyBasedCompareMixin):
"comes_from",
"requires_python",
"yanked_reason",
"dist_info_metadata",
"metadata_file_data",
"cache_link_parsing",
"egg_fragment",
]
Expand All @@ -190,7 +200,7 @@ def __init__(
comes_from: Optional[Union[str, "IndexContent"]] = None,
requires_python: Optional[str] = None,
yanked_reason: Optional[str] = None,
dist_info_metadata: Optional[str] = None,
metadata_file_data: Optional[MetadataFile] = None,
cache_link_parsing: bool = True,
hashes: Optional[Mapping[str, str]] = None,
) -> None:
Expand All @@ -208,18 +218,21 @@ def __init__(
a simple repository HTML link. If the file has been yanked but
no reason was provided, this should be the empty string. See
PEP 592 for more information and the specification.
:param dist_info_metadata: the metadata attached to the file, or None if no such
metadata is provided. This is the value of the "data-dist-info-metadata"
attribute, if present, in a simple repository HTML link. This may be parsed
into its own `Link` by `self.metadata_link()`. See PEP 658 for more
information and the specification.
:param metadata_file_data: the metadata attached to the file, or None if
no such metadata is provided. This argument, if not None, indicates
that a separate metadata file exists, and also optionally supplies
hashes for that file.
:param cache_link_parsing: A flag that is used elsewhere to determine
whether resources retrieved from this link should be cached. PyPI
URLs should generally have this set to False, for example.
:param hashes: A mapping of hash names to digests to allow us to
determine the validity of a download.
"""

# The comes_from, requires_python, and metadata_file_data arguments are
# only used by classmethods of this class, and are not used in client
# code directly.

# url can be a UNC windows share
if url.startswith("\\\\"):
url = path_to_url(url)
Expand All @@ -239,7 +252,7 @@ def __init__(
self.comes_from = comes_from
self.requires_python = requires_python if requires_python else None
self.yanked_reason = yanked_reason
self.dist_info_metadata = dist_info_metadata
self.metadata_file_data = metadata_file_data

super().__init__(key=url, defining_class=Link)

Expand All @@ -262,9 +275,25 @@ def from_json(
url = _ensure_quoted_url(urllib.parse.urljoin(page_url, file_url))
pyrequire = file_data.get("requires-python")
yanked_reason = file_data.get("yanked")
dist_info_metadata = file_data.get("dist-info-metadata")
hashes = file_data.get("hashes", {})

# PEP 714: Indexes must use the name core-metadata, but
# clients should support the old name as a fallback for compatibility.
metadata_info = file_data.get("core-metadata")
if metadata_info is None:
metadata_info = file_data.get("dist-info-metadata")

# The metadata info value may be a boolean, or a dict of hashes.
if isinstance(metadata_info, dict):
# The file exists, and hashes have been supplied
metadata_file_data = MetadataFile(supported_hashes(metadata_info))
elif metadata_info:
# The file exists, but there are no hashes
metadata_file_data = MetadataFile(None)
else:
# False or not present: the file does not exist
metadata_file_data = None

# The Link.yanked_reason expects an empty string instead of a boolean.
if yanked_reason and not isinstance(yanked_reason, str):
yanked_reason = ""
Expand All @@ -278,7 +307,7 @@ def from_json(
requires_python=pyrequire,
yanked_reason=yanked_reason,
hashes=hashes,
dist_info_metadata=dist_info_metadata,
metadata_file_data=metadata_file_data,
)

@classmethod
Expand All @@ -298,14 +327,39 @@ def from_element(
url = _ensure_quoted_url(urllib.parse.urljoin(base_url, href))
pyrequire = anchor_attribs.get("data-requires-python")
yanked_reason = anchor_attribs.get("data-yanked")
dist_info_metadata = anchor_attribs.get("data-dist-info-metadata")

# PEP 714: Indexes must use the name data-core-metadata, but
# clients should support the old name as a fallback for compatibility.
metadata_info = anchor_attribs.get("data-core-metadata")
if metadata_info is None:
metadata_info = anchor_attribs.get("data-dist-info-metadata")
# The metadata info value may be the string "true", or a string of
# the form "hashname=hashval"
if metadata_info == "true":
# The file exists, but there are no hashes
metadata_file_data = MetadataFile(None)
elif metadata_info is None:
# The file does not exist
metadata_file_data = None
else:
# The file exists, and hashes have been supplied
hashname, sep, hashval = metadata_info.partition("=")
if sep == "=":
metadata_file_data = MetadataFile(supported_hashes({hashname: hashval}))
else:
# Error - data is wrong. Treat as no hashes supplied.
logger.debug(
"Index returned invalid data-dist-info-metadata value: %s",
metadata_info,
)
metadata_file_data = MetadataFile(None)

return cls(
url,
comes_from=page_url,
requires_python=pyrequire,
yanked_reason=yanked_reason,
dist_info_metadata=dist_info_metadata,
metadata_file_data=metadata_file_data,
)

def __str__(self) -> str:
Expand Down Expand Up @@ -407,17 +461,13 @@ def subdirectory_fragment(self) -> Optional[str]:
return match.group(1)

def metadata_link(self) -> Optional["Link"]:
"""Implementation of PEP 658 parsing."""
# Note that Link.from_element() parsing the "data-dist-info-metadata" attribute
# from an HTML anchor tag is typically how the Link.dist_info_metadata attribute
# gets set.
if self.dist_info_metadata is None:
"""Return a link to the associated core metadata file (if any)."""
if self.metadata_file_data is None:
return None
metadata_url = f"{self.url_without_fragment}.metadata"
metadata_link_hash = LinkHash.parse_pep658_hash(self.dist_info_metadata)
if metadata_link_hash is None:
if self.metadata_file_data.hashes is None:
return Link(metadata_url)
return Link(metadata_url, hashes=metadata_link_hash.as_dict())
return Link(metadata_url, hashes=self.metadata_file_data.hashes)

def as_hashes(self) -> Hashes:
return Hashes({k: [v] for k, v in self._hashes.items()})
Expand Down
97 changes: 76 additions & 21 deletions tests/unit/test_collector.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
from pip._internal.models.link import (
Link,
LinkHash,
MetadataFile,
_clean_url_path,
_ensure_quoted_url,
)
Expand Down Expand Up @@ -485,13 +486,30 @@ def test_parse_links_json() -> None:
"requires-python": ">=3.7",
"dist-info-metadata": False,
},
# Same as above, but parsing dist-info-metadata.
# Same as above, but parsing core-metadata.
{
"filename": "holygrail-1.0-py3-none-any.whl",
"url": "/files/holygrail-1.0-py3-none-any.whl",
"hashes": {"sha256": "sha256 hash", "blake2b": "blake2b hash"},
"requires-python": ">=3.7",
"dist-info-metadata": "sha512=aabdd41",
"core-metadata": {"sha512": "aabdd41"},
},
# Ensure fallback to dist-info-metadata works
{
"filename": "holygrail-1.0-py3-none-any.whl",
"url": "/files/holygrail-1.0-py3-none-any.whl",
"hashes": {"sha256": "sha256 hash", "blake2b": "blake2b hash"},
"requires-python": ">=3.7",
"dist-info-metadata": {"sha512": "aabdd41"},
},
# Ensure that core-metadata gets priority.
{
"filename": "holygrail-1.0-py3-none-any.whl",
"url": "/files/holygrail-1.0-py3-none-any.whl",
"hashes": {"sha256": "sha256 hash", "blake2b": "blake2b hash"},
"requires-python": ">=3.7",
"core-metadata": {"sha512": "aabdd41"},
"dist-info-metadata": {"sha512": "this_is_wrong"},
},
],
}
Expand Down Expand Up @@ -527,7 +545,23 @@ def test_parse_links_json() -> None:
requires_python=">=3.7",
yanked_reason=None,
hashes={"sha256": "sha256 hash", "blake2b": "blake2b hash"},
dist_info_metadata="sha512=aabdd41",
metadata_file_data=MetadataFile({"sha512": "aabdd41"}),
),
Link(
"https://example.com/files/holygrail-1.0-py3-none-any.whl",
comes_from=page.url,
requires_python=">=3.7",
yanked_reason=None,
hashes={"sha256": "sha256 hash", "blake2b": "blake2b hash"},
metadata_file_data=MetadataFile({"sha512": "aabdd41"}),
),
Link(
"https://example.com/files/holygrail-1.0-py3-none-any.whl",
comes_from=page.url,
requires_python=">=3.7",
yanked_reason=None,
hashes={"sha256": "sha256 hash", "blake2b": "blake2b hash"},
metadata_file_data=MetadataFile({"sha512": "aabdd41"}),
),
]

Expand Down Expand Up @@ -585,30 +619,42 @@ def test_parse_links__yanked_reason(anchor_html: str, expected: Optional[str]) -
),
# Test with value "true".
(
'<a href="/pkg1-1.0.tar.gz" data-dist-info-metadata="true"></a>',
"true",
'<a href="/pkg1-1.0.tar.gz" data-core-metadata="true"></a>',
MetadataFile(None),
{},
),
# Test with a provided hash value.
(
'<a href="/pkg1-1.0.tar.gz" data-dist-info-metadata="sha256=aa113592bbe"></a>', # noqa: E501
"sha256=aa113592bbe",
'<a href="/pkg1-1.0.tar.gz" data-core-metadata="sha256=aa113592bbe"></a>', # noqa: E501
MetadataFile({"sha256": "aa113592bbe"}),
{},
),
# Test with a provided hash value for both the requirement as well as metadata.
(
'<a href="/pkg1-1.0.tar.gz#sha512=abc132409cb" data-dist-info-metadata="sha256=aa113592bbe"></a>', # noqa: E501
"sha256=aa113592bbe",
'<a href="/pkg1-1.0.tar.gz#sha512=abc132409cb" data-core-metadata="sha256=aa113592bbe"></a>', # noqa: E501
MetadataFile({"sha256": "aa113592bbe"}),
{"sha512": "abc132409cb"},
),
# Ensure the fallback to the old name works.
(
'<a href="/pkg1-1.0.tar.gz" data-dist-info-metadata="sha256=aa113592bbe"></a>', # noqa: E501
MetadataFile({"sha256": "aa113592bbe"}),
{},
),
# Ensure that the data-core-metadata name gets priority.
(
'<a href="/pkg1-1.0.tar.gz" data-core-metadata="sha256=aa113592bbe" data-dist-info-metadata="sha256=invalid_value"></a>', # noqa: E501
MetadataFile({"sha256": "aa113592bbe"}),
{},
),
],
)
def test_parse_links__dist_info_metadata(
def test_parse_links__metadata_file_data(
anchor_html: str,
expected: Optional[str],
hashes: Dict[str, str],
) -> None:
link = _test_parse_links_data_attribute(anchor_html, "dist_info_metadata", expected)
link = _test_parse_links_data_attribute(anchor_html, "metadata_file_data", expected)
assert link._hashes == hashes


Expand Down Expand Up @@ -1080,17 +1126,26 @@ def test_link_hash_parsing(url: str, result: Optional[LinkHash]) -> None:


@pytest.mark.parametrize(
"dist_info_metadata, result",
"metadata_attrib, expected",
[
("sha256=aa113592bbe", LinkHash("sha256", "aa113592bbe")),
("sha256=", LinkHash("sha256", "")),
("sha500=aa113592bbe", None),
("true", None),
("", None),
("aa113592bbe", None),
("sha256=aa113592bbe", MetadataFile({"sha256": "aa113592bbe"})),
("sha256=", MetadataFile({"sha256": ""})),
("sha500=aa113592bbe", MetadataFile(None)),
("true", MetadataFile(None)),
(None, None),
# Attribute is present but invalid
("", MetadataFile(None)),
("aa113592bbe", MetadataFile(None)),
],
)
def test_pep658_hash_parsing(
dist_info_metadata: str, result: Optional[LinkHash]
def test_metadata_file_info_parsing_html(
metadata_attrib: str, expected: Optional[MetadataFile]
) -> None:
assert LinkHash.parse_pep658_hash(dist_info_metadata) == result
attribs: Dict[str, Optional[str]] = {
"href": "something",
"data-dist-info-metadata": metadata_attrib,
}
page_url = "dummy_for_comes_from"
base_url = "https://index.url/simple"
link = Link.from_element(attribs, page_url, base_url)
assert link is not None and link.metadata_file_data == expected