Use retain_package_versions to filter out older pkgs at sync time
closes pulp#2479
dralley committed May 31, 2022
1 parent 5a2914d commit 02d345d
Showing 3 changed files with 82 additions and 8 deletions.
1 change: 1 addition & 0 deletions CHANGES/2479.feature
@@ -0,0 +1 @@
Using `retain_package_versions` (with the required "additive" `sync_policy`) will now avoid downloading older packages when syncing with the "on_demand" `download_policy`, resulting in much faster and more efficient syncs.
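
For context, a rough sketch of how this feature is exercised end-to-end via the Pulp 3 REST API. The hostname, credentials, and repository/remote names are illustrative; the field names `retain_package_versions`, `policy`, and `sync_policy` are the ones this change relies on:

import requests

BASE = "https://pulp.example.com"  # hypothetical Pulp instance
auth = ("admin", "password")

# A repository that retains only the single newest version of each package
repo = requests.post(
    f"{BASE}/pulp/api/v3/repositories/rpm/rpm/",
    json={"name": "baseos", "retain_package_versions": 1},
    auth=auth,
).json()

# A remote with download policy "on_demand", so packages are fetched lazily
remote = requests.post(
    f"{BASE}/pulp/api/v3/remotes/rpm/rpm/",
    json={
        "name": "baseos",
        "url": "https://mirror.example.com/baseos/x86_64/os/",
        "policy": "on_demand",
    },
    auth=auth,
).json()

# retain_package_versions requires the "additive" sync_policy; with this
# change, older package versions are skipped while parsing primary.xml
requests.post(
    f"{BASE}{repo['pulp_href']}sync/",
    json={"remote": remote["pulp_href"], "sync_policy": "additive"},
    auth=auth,
)
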
11 changes: 11 additions & 0 deletions pulp_rpm/app/metadata_parsing.py
@@ -34,6 +34,17 @@ def from_metadata_files(primary_xml_path, filelists_xml_path, other_xml_path):
parser.other_xml_path = other_xml_path
return parser

def for_each_pkg_primary(self, callback):
"""Execute a callback for each package, parsed from only primary package metadata.
Only primary metadata means no files or changelogs.
"""

def pkgcb(pkg):
callback(pkg)

cr.xml_parse_primary(self.primary_xml_path, pkgcb=pkgcb, do_files=False)

def count_packages(self):
"""Count the total number of packages."""
# It would be much faster to just read the number in the header of the metadata.
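
The new `for_each_pkg_primary()` helper above wraps createrepo_c's callback-based parser so that only primary.xml is read. A minimal standalone sketch of the same pattern (the metadata path is illustrative):

import createrepo_c as cr

nevras = []

def pkgcb(pkg):
    # Each parsed package carries name/epoch/version/release/arch from
    # primary.xml, but no file lists or changelogs, so this pass is cheap.
    nevras.append(pkg.nevra())

# do_files=False skips any file entries that primary.xml itself contains
cr.xml_parse_primary("repodata/primary.xml.gz", pkgcb=pkgcb, do_files=False)
print(f"parsed {len(nevras)} packages")

This is the same createrepo_c API that the `verification_and_skip_callback` pass below is built on.
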
78 changes: 70 additions & 8 deletions pulp_rpm/app/tasks/synchronizing.py
@@ -21,6 +21,7 @@

import createrepo_c as cr
import libcomps
from rpmrepo_metadata import EVR

from pulpcore.plugin.models import (
Artifact,
@@ -878,14 +879,16 @@ async def parse_repository_metadata(self, repomd, metadata_results):
# know that it is referenced in modulemd.
modulemd_list = []
modulemd_result = metadata_results.get("modules", None)
modulemd_all = []
if modulemd_result:
modulemd_list = await self.parse_modules_metadata(modulemd_result)
(modulemd_list, modulemd_all) = await self.parse_modules_metadata(modulemd_result)

# **Now** we can successfully parse the package metadata
await self.parse_packages(
metadata_results["primary"],
metadata_results["filelists"],
metadata_results["other"],
modulemd_list=modulemd_all,
)

groups_list = []
@@ -1012,7 +1015,7 @@ async def parse_modules_metadata(self, modulemd_result):
for default_content_dc in default_content_dcs:
await self.put(default_content_dc)

return modulemd_list
return (modulemd_list, modulemd_all)

async def parse_packages_components(self, comps_result):
"""Parse packages' components that define how are the packages bundled."""
@@ -1116,7 +1119,7 @@ async def parse_packages_components(self, comps_result):

return dc_groups

async def parse_packages(self, primary_xml, filelists_xml, other_xml):
async def parse_packages(self, primary_xml, filelists_xml, other_xml, modulemd_list=None):
"""Parse packages from the remote repository."""
parser = MetadataParser.from_metadata_files(
primary_xml.path, filelists_xml.path, other_xml.path
@@ -1151,15 +1154,33 @@ async def parse_packages(self, primary_xml, filelists_xml, other_xml):
"Please read https://github.com/pulp/pulp_rpm/issues/2402 for more details."
)

pkg_iterator = parser.as_iterator()
modular_artifact_nevras = set()

for pkg in pkg_iterator:
for modulemd in modulemd_list:
modular_artifact_nevras |= set(modulemd[PULP_MODULE_ATTR.ARTIFACTS])

package_skip_nevras = set()
# The repository can contain packages of arbitrary arches, and versions are only
# comparable between packages of the same arch and name, e.g.
# {"x86_64": {"glibc": [...]}, "i686": {"glibc": [...]}, "src": {"glibc": [...]}}
latest_packages_by_arch_and_name = defaultdict(lambda: defaultdict(list))

# Perform various checks and potentially filter out unwanted packages.
# We parse all of primary.xml first and fail fast if something is wrong.
# Collect the NEVRAs of any packages we don't want to include.
def verification_and_skip_callback(pkg):
nonlocal pkgid_warning_triggered
nonlocal nevra_warning_triggered
nonlocal package_skip_nevras
nonlocal latest_packages_by_arch_and_name

# Check for packages with duplicate pkgids
if not pkgid_warning_triggered and pkg.pkgId in checksums:
pkgid_warning_triggered = True
if self.mirror_metadata:
raise Exception(ERR_MSG.format("PKGIDs"))
else:
log.warn(WARN_MSG.format("PKGIDs"))
# Check for packages with duplicate NEVRAs
if not nevra_warning_triggered and pkg.nevra() in nevras:
nevra_warning_triggered = True
if self.mirror_metadata:
@@ -1169,16 +1190,57 @@ async def parse_packages(self, primary_xml, filelists_xml, other_xml):
nevras.add(pkg.nevra())
checksums.add(pkg.pkgId)

if skip_srpms and pkg.arch == "src":
continue

# Check that all packages are within the root of the repo (if in mirror_complete mode)
if self.mirror_metadata:
uses_base_url = pkg.location_base
illegal_relative_path = self.is_illegal_relative_path(pkg.location_href)

if uses_base_url or illegal_relative_path:
raise ValueError(MIRROR_INCOMPATIBLE_REPO_ERR_MSG)

# Add any srpms to the skip set
if skip_srpms and pkg.arch == "src":
package_skip_nevras.add(pkg.nevra())

# Collect the N highest-version packages per arch and name, kick the older
# ones out and add them to the skip list
if self.repository.retain_package_versions:
comparables = latest_packages_by_arch_and_name[pkg.arch][pkg.name]
if len(comparables) < self.repository.retain_package_versions:
comparables.append(pkg)
else:
curr_evr = EVR(pkg.epoch, pkg.version, pkg.release)
# Sort ascending so the oldest retained version is the eviction candidate
comparables.sort(key=lambda x: EVR(x.epoch, x.version, x.release))
for idx in range(len(comparables)):
other_pkg = comparables[idx]
other_evr = EVR(other_pkg.epoch, other_pkg.version, other_pkg.release)
if curr_evr > other_evr:
package_skip_nevras.add(other_pkg.nevra())
comparables[idx] = pkg
break
else:
package_skip_nevras.add(pkg.nevra())

# Ew, callback-based API, gross. The streaming API doesn't support optionally
# specifying particular files yet, so we have to use the old way.
parser.for_each_pkg_primary(verification_and_skip_callback)
del latest_packages_by_arch_and_name

# TODO: should any SRPMs that are part of modules be skipped if skip_srpms is enabled?
# For now, any package listed as a modular artifact is removed from the skip set.
package_skip_nevras -= modular_artifact_nevras

log.debug(
"Skipping {} packages due to retain_package_versions".format(
len(package_skip_nevras)
)
)

for pkg in parser.as_iterator():
if package_skip_nevras and pkg.nevra() in package_skip_nevras:
continue
package = Package(**Package.createrepo_to_dict(pkg))
base_url = pkg.location_base or self.remote_url
url = urlpath_sanitize(base_url, package.location_href)
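
Condensing the two-pass logic above into a standalone sketch may help: the first pass decides which NEVRAs to skip, keeping only the newest N versions per (arch, name); the second pass drops those packages before any download work is created. `rpmrepo_metadata.EVR` is replaced here with a simplified tuple stand-in (real RPM version comparison segments the version strings rather than comparing them as plain text):

from collections import defaultdict, namedtuple

Pkg = namedtuple("Pkg", "name epoch version release arch")

def nevra(p):
    return f"{p.name}-{p.epoch}:{p.version}-{p.release}.{p.arch}"

def evr(p):
    # Simplified stand-in: real EVR comparison is not plain string ordering
    return (int(p.epoch), p.version, p.release)

def compute_skip_nevras(packages, retain):
    """Pass 1: keep the newest `retain` versions of each package per arch."""
    latest = defaultdict(lambda: defaultdict(list))
    skips = set()
    for pkg in packages:
        comparables = latest[pkg.arch][pkg.name]
        if len(comparables) < retain:
            comparables.append(pkg)
            continue
        comparables.sort(key=evr)  # ascending: oldest retained version first
        if evr(pkg) > evr(comparables[0]):
            skips.add(nevra(comparables[0]))  # evict the oldest retained pkg
            comparables[0] = pkg
        else:
            skips.add(nevra(pkg))
    return skips

pkgs = [
    Pkg("glibc", "0", "2.28", "1", "x86_64"),
    Pkg("glibc", "0", "2.34", "1", "x86_64"),
    Pkg("glibc", "0", "2.31", "1", "x86_64"),
]
print(compute_skip_nevras(pkgs, retain=1))
# {'glibc-0:2.28-1.x86_64', 'glibc-0:2.31-1.x86_64'} -- only 2.34 survives

Pass 2 is then just `if pkg.nevra() in package_skip_nevras: continue` while iterating the full metadata, as in the final loop of `parse_packages()` above.
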
