Use retain_package_versions to filter out older pkgs at sync time
closes pulp#2479
dralley committed May 26, 2022
1 parent 5a2914d commit 0053b79
Showing 3 changed files with 55 additions and 6 deletions.
1 change: 1 addition & 0 deletions CHANGES/2479.feature
@@ -0,0 +1 @@
Using `retain_package_versions` (with the required "additive" `sync_policy`) will now avoid downloading the older packages when synced with download_policy "immediate", resulting in much faster and more efficient syncs.
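For context, a minimal sketch of how this feature is exercised end to end against the Pulp 3 REST API. The field and endpoint names follow the Pulp RPM plugin's published API, but the host, credentials, and hrefs below are hypothetical placeholders:

import requests

BASE = "http://localhost:24817"  # hypothetical Pulp API host
AUTH = ("admin", "password")  # hypothetical credentials

# Hypothetical hrefs for an existing RPM repository and remote.
repo_href = "/pulp/api/v3/repositories/rpm/rpm/<uuid>/"
remote_href = "/pulp/api/v3/remotes/rpm/rpm/<uuid>/"

# Keep only the two newest versions of each package at sync time.
requests.patch(BASE + repo_href, json={"retain_package_versions": 2}, auth=AUTH)

# retain_package_versions requires the "additive" sync_policy.
requests.post(
    BASE + repo_href + "sync/",
    json={"remote": remote_href, "sync_policy": "additive"},
    auth=AUTH,
)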
10 changes: 10 additions & 0 deletions pulp_rpm/app/metadata_parsing.py
@@ -34,6 +34,16 @@ def from_metadata_files(primary_xml_path, filelists_xml_path, other_xml_path):
        parser.other_xml_path = other_xml_path
        return parser

    def for_each_pkg_primary(self, callback):
        """Execute a callback for each package, parsed from primary package metadata only.

        "Primary metadata only" means no file lists and no changelogs are parsed.
        """

        def pkgcb(pkg):
            callback(pkg)

        cr.xml_parse_primary(self.primary_xml_path, pkgcb=pkgcb, do_files=False)

    def count_packages(self):
        """Count the total number of packages."""
        # It would be much faster to just read the number in the header of the metadata.
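As an aside, a usage sketch of the new helper, assuming local repodata files and that the parser class in this module is named MetadataParser (an assumption) and is constructed via from_metadata_files(). The per-arch tally is illustrative, not part of the commit; only primary.xml is read, and do_files=False skips even the file entries it contains:

import collections

from pulp_rpm.app.metadata_parsing import MetadataParser  # assumed class name

parser = MetadataParser.from_metadata_files(
    "repodata/primary.xml.gz",
    "repodata/filelists.xml.gz",
    "repodata/other.xml.gz",
)

# Tally packages per arch; this stays cheap even for very large repositories
# because filelists and changelogs are never parsed.
counts = collections.Counter()
parser.for_each_pkg_primary(lambda pkg: counts.update([pkg.arch]))
print(counts.most_common())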
50 changes: 44 additions & 6 deletions pulp_rpm/app/tasks/synchronizing.py
@@ -1151,15 +1151,28 @@ async def parse_packages(self, primary_xml, filelists_xml, other_xml):
"Please read https://github.com/pulp/pulp_rpm/issues/2402 for more details."
)

        pkg_iterator = parser.as_iterator()

        for pkg in pkg_iterator:
        package_skip_nevras = set()
        # The repository can contain packages of arbitrary arches, which are not
        # comparable with one another, so track the latest versions per arch:
        # {"x86_64": {"glibc": [...]}, "i686": {"glibc": [...]}, "src": {"glibc": [...]}}
        latest_packages_by_arch_and_name = defaultdict(lambda: defaultdict(list))

        # Perform various checks and potentially filter out unwanted packages.
        # We parse all of primary.xml first and fail fast if something is wrong.
        # Collect a set of the NEVRAs of any packages we don't want to include.
        def verification_and_skip_callback(pkg):
            nonlocal pkgid_warning_triggered
            nonlocal nevra_warning_triggered
            nonlocal package_skip_nevras
            nonlocal latest_packages_by_arch_and_name

            # Check for packages with duplicate pkgids
            if not pkgid_warning_triggered and pkg.pkgId in checksums:
                pkgid_warning_triggered = True
                if self.mirror_metadata:
                    raise Exception(ERR_MSG.format("PKGIDs"))
                else:
                    log.warn(WARN_MSG.format("PKGIDs"))
            # Check for packages with duplicate NEVRAs
            if not nevra_warning_triggered and pkg.nevra() in nevras:
                nevra_warning_triggered = True
                if self.mirror_metadata:
@@ -1169,16 +1182,41 @@ async def parse_packages(self, primary_xml, filelists_xml, other_xml):
            nevras.add(pkg.nevra())
            checksums.add(pkg.pkgId)

            if skip_srpms and pkg.arch == "src":
                continue

            # Check that all packages are within the root of the repo (if in mirror_complete mode)
            if self.mirror_metadata:
                uses_base_url = pkg.location_base
                illegal_relative_path = self.is_illegal_relative_path(pkg.location_href)

                if uses_base_url or illegal_relative_path:
                    raise ValueError(MIRROR_INCOMPATIBLE_REPO_ERR_MSG)

            # Add any srpms to the skip set
            if skip_srpms and pkg.arch == "src":
                package_skip_nevras.add(pkg.nevra())

            # Collect the N highest-version packages; kick out the older ones and add
            # those to the skip set
            if self.repository.retain_package_versions:
                comparables = latest_packages_by_arch_and_name[pkg.arch][pkg.name]
                if len(comparables) < self.repository.retain_package_versions:
                    comparables.append(pkg)
                    comparables.sort(
                        key=lambda p: EVR(p.epoch, p.version, p.release), reverse=True
                    )
                else:
                    curr_evr = EVR(pkg.epoch, pkg.version, pkg.release)
                    # comparables is sorted newest-first, so its last entry is the
                    # oldest retained version of this package.
                    oldest_pkg = comparables[-1]
                    oldest_evr = EVR(oldest_pkg.epoch, oldest_pkg.version, oldest_pkg.release)
                    if oldest_evr < curr_evr:
                        # Kick out the oldest retained package in favor of this one.
                        package_skip_nevras.add(oldest_pkg.nevra())
                        comparables[-1] = pkg
                        comparables.sort(
                            key=lambda p: EVR(p.epoch, p.version, p.release), reverse=True
                        )
                    else:
                        # Older than every retained version: skip this package.
                        package_skip_nevras.add(pkg.nevra())

        parser.for_each_pkg_primary(verification_and_skip_callback)
        del latest_packages_by_arch_and_name

        for pkg in parser.as_iterator():
            if pkg.nevra() in package_skip_nevras:
                continue

            package = Package(**Package.createrepo_to_dict(pkg))
            base_url = pkg.location_base or self.remote_url
            url = urlpath_sanitize(base_url, package.location_href)
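The sync-time filtering above works in two passes: a cheap primary-only pass builds package_skip_nevras, then the full parse simply skips those NEVRAs. Here is a self-contained sketch of the same "retain only the N newest versions per arch and name" selection, using namedtuples in place of createrepo_c packages and plain tuple ordering as a stand-in for real RPM EVR comparison:

from collections import defaultdict, namedtuple
import heapq

Pkg = namedtuple("Pkg", "name arch epoch version release")

def evr(pkg):
    # Stand-in ordering; real code should use RPM EVR comparison
    # (e.g. rpm.labelCompare) rather than plain tuple comparison.
    return (pkg.epoch, pkg.version, pkg.release)

def packages_to_skip(packages, retain):
    """Return the packages that fall outside the `retain` newest versions."""
    skip = set()
    retained = defaultdict(lambda: defaultdict(list))  # arch -> name -> min-heap
    for pkg in packages:
        heap = retained[pkg.arch][pkg.name]
        if len(heap) < retain:
            heapq.heappush(heap, (evr(pkg), pkg))
        elif heap[0][0] < evr(pkg):
            # The new package outranks the oldest retained one: swap them.
            _, evicted = heapq.heapreplace(heap, (evr(pkg), pkg))
            skip.add(evicted)
        else:
            skip.add(pkg)  # older than everything already retained
    return skip

pkgs = [Pkg("glibc", "x86_64", 0, v, "1") for v in ("2.33", "2.35", "2.34")]
print(packages_to_skip(pkgs, retain=2))  # skips the 2.33 package

A min-heap keyed by EVR makes the eviction choice explicit: the oldest retained version always sits at the top, so each incoming package either displaces it or is skipped outright.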
