Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: performance regression when parsing links from legacy repositories #6442

Merged
merged 4 commits into from
Sep 9, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 13 additions & 1 deletion poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ python = "^3.7"

poetry-core = "^1.1.0"
poetry-plugin-export = "^1.0.6"
"backports.cached-property" = { version = "^1.0.2", python = "<3.8" }
cachecontrol = { version = "^0.12.9", extras = ["filecache"] }
cachy = "^0.3.0"
cleo = "^1.0.0a5"
Expand Down
33 changes: 15 additions & 18 deletions src/poetry/repositories/link_sources/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,15 @@
import logging
import re

from abc import abstractmethod
from typing import TYPE_CHECKING
from typing import DefaultDict
from typing import List

from packaging.utils import canonicalize_name
from poetry.core.packages.package import Package
from poetry.core.semver.version import Version

from poetry.utils._compat import cached_property
from poetry.utils.patterns import sdist_file_re
from poetry.utils.patterns import wheel_file_re

Expand All @@ -20,6 +22,8 @@
from packaging.utils import NormalizedName
from poetry.core.packages.utils.link import Link

LinkCache = DefaultDict[NormalizedName, DefaultDict[Version, List[Link]]]


logger = logging.getLogger(__name__)

Expand All @@ -44,16 +48,8 @@ def __init__(self, url: str) -> None:
def url(self) -> str:
return self._url

def versions(self, name: str) -> Iterator[Version]:
name = canonicalize_name(name)
seen: set[Version] = set()

for link in self.links:
pkg = self.link_package_data(link)

if pkg and pkg.name == name and pkg.version not in seen:
seen.add(pkg.version)
yield pkg.version
def versions(self, name: NormalizedName) -> Iterator[Version]:
yield from self._link_cache[name]

@property
def packages(self) -> Iterator[Package]:
Expand All @@ -64,9 +60,10 @@ def packages(self) -> Iterator[Package]:
yield pkg

@property
@abstractmethod
def links(self) -> Iterator[Link]:
raise NotImplementedError()
for links_per_version in self._link_cache.values():
for links in links_per_version.values():
yield from links

@classmethod
def link_package_data(cls, link: Link) -> Package | None:
Expand Down Expand Up @@ -102,11 +99,7 @@ def link_package_data(cls, link: Link) -> Package | None:
def links_for_version(
self, name: NormalizedName, version: Version
) -> Iterator[Link]:
for link in self.links:
pkg = self.link_package_data(link)

if pkg and pkg.name == name and pkg.version == version:
yield link
yield from self._link_cache[name][version]

def clean_link(self, url: str) -> str:
"""Makes sure a link is fully encoded. That is, if a ' ' shows up in
Expand All @@ -127,3 +120,7 @@ def yanked(self, name: NormalizedName, version: Version) -> str | bool:
if reasons:
return "\n".join(sorted(reasons))
return True

@cached_property
def _link_cache(self) -> LinkCache:
raise NotImplementedError()
16 changes: 12 additions & 4 deletions src/poetry/repositories/link_sources/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,19 @@
import urllib.parse
import warnings

from collections import defaultdict
from html import unescape
from typing import TYPE_CHECKING

from poetry.core.packages.utils.link import Link

from poetry.repositories.link_sources.base import LinkSource
from poetry.utils._compat import cached_property


if TYPE_CHECKING:
from collections.abc import Iterator
from poetry.repositories.link_sources.base import LinkCache


with warnings.catch_warnings():
warnings.simplefilter("ignore")
Expand All @@ -25,8 +28,9 @@ def __init__(self, url: str, content: str) -> None:

self._parsed = html5lib.parse(content, namespaceHTMLElements=False)

@property
def links(self) -> Iterator[Link]:
@cached_property
def _link_cache(self) -> LinkCache:
links: LinkCache = defaultdict(lambda: defaultdict(list))
for anchor in self._parsed.findall(".//a"):
if anchor.get("href"):
href = anchor.get("href")
Expand All @@ -44,7 +48,11 @@ def links(self) -> Iterator[Link]:
if link.ext not in self.SUPPORTED_FORMATS:
continue

yield link
pkg = self.link_package_data(link)
if pkg:
links[pkg.name][pkg.version].append(link)

return links


class SimpleRepositoryPage(HTMLPage):
Expand Down
16 changes: 15 additions & 1 deletion src/poetry/utils/_compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,12 @@
else:
from importlib import metadata

if sys.version_info < (3, 8):
# compatibility for python <3.8
from backports.cached_property import cached_property
else:
from functools import cached_property

WINDOWS = sys.platform == "win32"


Expand Down Expand Up @@ -53,4 +59,12 @@ def list_to_shell_command(cmd: list[str]) -> str:
)


__all__ = ["WINDOWS", "decode", "encode", "list_to_shell_command", "metadata", "to_str"]
__all__ = [
"WINDOWS",
"cached_property",
"decode",
"encode",
"list_to_shell_command",
"metadata",
"to_str",
]
27 changes: 17 additions & 10 deletions tests/repositories/link_sources/test_base.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from __future__ import annotations

from collections import defaultdict
from typing import TYPE_CHECKING
from unittest.mock import PropertyMock

Expand All @@ -24,16 +25,22 @@ def link_source(mocker: MockerFixture) -> LinkSource:
url = "https://example.org"
link_source = LinkSource(url)
mocker.patch(
f"{LinkSource.__module__}.{LinkSource.__qualname__}.links",
f"{LinkSource.__module__}.{LinkSource.__qualname__}._link_cache",
new_callable=PropertyMock,
return_value=iter(
[
Link(f"{url}/demo-0.1.0.tar.gz"),
Link(f"{url}/demo-0.1.0_invalid.tar.gz"),
Link(f"{url}/invalid.tar.gz"),
Link(f"{url}/demo-0.1.0-py2.py3-none-any.whl"),
Link(f"{url}/demo-0.1.1.tar.gz"),
]
return_value=defaultdict(
lambda: defaultdict(list),
{
canonicalize_name("demo"): defaultdict(
list,
{
Version.parse("0.1.0"): [
Link(f"{url}/demo-0.1.0.tar.gz"),
Link(f"{url}/demo-0.1.0-py2.py3-none-any.whl"),
],
Version.parse("0.1.1"): [Link(f"{url}/demo-0.1.1.tar.gz")],
},
),
},
),
)
return link_source
Expand Down Expand Up @@ -63,7 +70,7 @@ def test_link_package_data(filename: str, expected: Package | None) -> None:
],
)
def test_versions(name: str, expected: set[Version], link_source: LinkSource) -> None:
assert set(link_source.versions(name)) == expected
assert set(link_source.versions(canonicalize_name(name))) == expected


def test_packages(link_source: LinkSource) -> None:
Expand Down
3 changes: 2 additions & 1 deletion tests/repositories/link_sources/test_html.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import pytest

from packaging.utils import canonicalize_name
from poetry.core.packages.utils.link import Link
from poetry.core.semver.version import Version

Expand Down Expand Up @@ -90,4 +91,4 @@ def test_yanked(yanked_attrs: tuple[str, str], expected: bool | str) -> None:
content = DEMO_TEMPLATE.format(anchors)
page = HTMLPage("https://example.org", content)

assert page.yanked("demo", Version.parse("0.1")) == expected
assert page.yanked(canonicalize_name("demo"), Version.parse("0.1")) == expected
17 changes: 2 additions & 15 deletions tests/repositories/test_legacy_repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,25 +102,12 @@ def test_page_invalid_version_link() -> None:
assert page is not None

links = list(page.links)
assert len(links) == 2
assert len(links) == 1

versions = list(page.versions("poetry"))
versions = list(page.versions(canonicalize_name("poetry")))
assert len(versions) == 1
assert versions[0].to_string() == "0.1.0"

invalid_link = None

for link in links:
if link.filename.startswith("poetry-21"):
invalid_link = link
break

links_010 = list(page.links_for_version(canonicalize_name("poetry"), versions[0]))
assert invalid_link not in links_010

assert invalid_link
assert not page.link_package_data(invalid_link)

packages = list(page.packages)
assert len(packages) == 1
assert packages[0].name == "poetry"
Expand Down