Use retain_package_versions to filter out older pkgs at sync time
closes pulp#2479
dralley committed May 26, 2022
1 parent 5a2914d commit 0053b79
Showing 3 changed files with 55 additions and 6 deletions.
1 change: 1 addition & 0 deletions CHANGES/2479.feature
@@ -0,0 +1 @@
Using `retain_package_versions` (with the required "additive" `sync_policy`) will now avoid downloading the older packages when synced with download_policy "immediate", resulting in much faster and more efficient syncs.
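For context, a minimal sketch of how this feature is exercised end to end against the Pulp 3 REST API. The field and endpoint names follow the Pulp RPM plugin's published API, but the host, credentials, and hrefs below are hypothetical placeholders:

import requests

BASE = "http://localhost:24817"  # hypothetical Pulp API host
AUTH = ("admin", "password")  # hypothetical credentials

# Hypothetical hrefs for an existing RPM repository and remote.
repo_href = "/pulp/api/v3/repositories/rpm/rpm/<uuid>/"
remote_href = "/pulp/api/v3/remotes/rpm/rpm/<uuid>/"

# Keep only the two newest versions of each package at sync time.
requests.patch(BASE + repo_href, json={"retain_package_versions": 2}, auth=AUTH)

# retain_package_versions requires the "additive" sync_policy.
requests.post(
    BASE + repo_href + "sync/",
    json={"remote": remote_href, "sync_policy": "additive"},
    auth=AUTH,
)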
10 changes: 10 additions & 0 deletions pulp_rpm/app/metadata_parsing.py
@@ -34,6 +34,16 @@ def from_metadata_files(primary_xml_path, filelists_xml_path, other_xml_path):
        parser.other_xml_path = other_xml_path
        return parser

    def for_each_pkg_primary(self, callback):
        """Execute a callback for each package, parsed from primary package metadata only.

        "Primary metadata only" means no file lists and no changelogs are parsed.
        """

        def pkgcb(pkg):
            callback(pkg)

        cr.xml_parse_primary(self.primary_xml_path, pkgcb=pkgcb, do_files=False)

    def count_packages(self):
        """Count the total number of packages."""
        # It would be much faster to just read the number in the header of the metadata.
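As an aside, a usage sketch of the new helper, assuming local repodata files and that the parser class in this module is named MetadataParser (an assumption) and is constructed via from_metadata_files(). The per-arch tally is illustrative, not part of the commit; only primary.xml is read, and do_files=False skips even the file entries it contains:

import collections

from pulp_rpm.app.metadata_parsing import MetadataParser  # assumed class name

parser = MetadataParser.from_metadata_files(
    "repodata/primary.xml.gz",
    "repodata/filelists.xml.gz",
    "repodata/other.xml.gz",
)

# Tally packages per arch; this stays cheap even for very large repositories
# because filelists and changelogs are never parsed.
counts = collections.Counter()
parser.for_each_pkg_primary(lambda pkg: counts.update([pkg.arch]))
print(counts.most_common())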
50 changes: 44 additions & 6 deletions pulp_rpm/app/tasks/synchronizing.py
@@ -1151,15 +1151,28 @@ async def parse_packages(self, primary_xml, filelists_xml, other_xml):
"Please read https://github.com/pulp/pulp_rpm/issues/2402 for more details."
)

        pkg_iterator = parser.as_iterator()

        for pkg in pkg_iterator:
        package_skip_nevras = set()
        # The repository can contain packages of arbitrary arches, which are not
        # comparable with one another, so track the latest versions per arch:
        # {"x86_64": {"glibc": [...]}, "i686": {"glibc": [...]}, "src": {"glibc": [...]}}
        latest_packages_by_arch_and_name = defaultdict(lambda: defaultdict(list))

        # Perform various checks and potentially filter out unwanted packages.
        # We parse all of primary.xml first and fail fast if something is wrong.
        # Collect a set of the NEVRAs of any packages we don't want to include.
        def verification_and_skip_callback(pkg):
            nonlocal pkgid_warning_triggered
            nonlocal nevra_warning_triggered
            nonlocal package_skip_nevras
            nonlocal latest_packages_by_arch_and_name

            # Check for packages with duplicate pkgids
            if not pkgid_warning_triggered and pkg.pkgId in checksums:
                pkgid_warning_triggered = True
                if self.mirror_metadata:
                    raise Exception(ERR_MSG.format("PKGIDs"))
                else:
                    log.warn(WARN_MSG.format("PKGIDs"))
            # Check for packages with duplicate NEVRAs
            if not nevra_warning_triggered and pkg.nevra() in nevras:
                nevra_warning_triggered = True
                if self.mirror_metadata:
@@ -1169,16 +1182,41 @@ async def parse_packages(self, primary_xml, filelists_xml, other_xml):
            nevras.add(pkg.nevra())
            checksums.add(pkg.pkgId)

            if skip_srpms and pkg.arch == "src":
                continue

            # Check that all packages are within the root of the repo (if in mirror_complete mode)
            if self.mirror_metadata:
                uses_base_url = pkg.location_base
                illegal_relative_path = self.is_illegal_relative_path(pkg.location_href)

                if uses_base_url or illegal_relative_path:
                    raise ValueError(MIRROR_INCOMPATIBLE_REPO_ERR_MSG)

            # Add any srpms to the skip set
            if skip_srpms and pkg.arch == "src":
                package_skip_nevras.add(pkg.nevra())

            # Collect the N highest-version packages; kick out the older ones and add
            # those to the skip set
            if self.repository.retain_package_versions:
                comparables = latest_packages_by_arch_and_name[pkg.arch][pkg.name]
                if len(comparables) < self.repository.retain_package_versions:
                    comparables.append(pkg)
                    comparables.sort(
                        key=lambda p: EVR(p.epoch, p.version, p.release), reverse=True
                    )
                else:
                    curr_evr = EVR(pkg.epoch, pkg.version, pkg.release)
                    # comparables is sorted newest-first, so its last entry is the
                    # oldest retained version of this package.
                    oldest_pkg = comparables[-1]
                    oldest_evr = EVR(oldest_pkg.epoch, oldest_pkg.version, oldest_pkg.release)
                    if oldest_evr < curr_evr:
                        # Kick out the oldest retained package in favor of this one.
                        package_skip_nevras.add(oldest_pkg.nevra())
                        comparables[-1] = pkg
                        comparables.sort(
                            key=lambda p: EVR(p.epoch, p.version, p.release), reverse=True
                        )
                    else:
                        # Older than every retained version: skip this package.
                        package_skip_nevras.add(pkg.nevra())

        parser.for_each_pkg_primary(verification_and_skip_callback)
        del latest_packages_by_arch_and_name

        for pkg in parser.as_iterator():
            if pkg.nevra() in package_skip_nevras:
                continue

            package = Package(**Package.createrepo_to_dict(pkg))
            base_url = pkg.location_base or self.remote_url
            url = urlpath_sanitize(base_url, package.location_href)
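The sync-time filtering above works in two passes: a cheap primary-only pass builds package_skip_nevras, then the full parse simply skips those NEVRAs. Here is a self-contained sketch of the same "retain only the N newest versions per arch and name" selection, using namedtuples in place of createrepo_c packages and plain tuple ordering as a stand-in for real RPM EVR comparison:

from collections import defaultdict, namedtuple
import heapq

Pkg = namedtuple("Pkg", "name arch epoch version release")

def evr(pkg):
    # Stand-in ordering; real code should use RPM EVR comparison
    # (e.g. rpm.labelCompare) rather than plain tuple comparison.
    return (pkg.epoch, pkg.version, pkg.release)

def packages_to_skip(packages, retain):
    """Return the packages that fall outside the `retain` newest versions."""
    skip = set()
    retained = defaultdict(lambda: defaultdict(list))  # arch -> name -> min-heap
    for pkg in packages:
        heap = retained[pkg.arch][pkg.name]
        if len(heap) < retain:
            heapq.heappush(heap, (evr(pkg), pkg))
        elif heap[0][0] < evr(pkg):
            # The new package outranks the oldest retained one: swap them.
            _, evicted = heapq.heapreplace(heap, (evr(pkg), pkg))
            skip.add(evicted)
        else:
            skip.add(pkg)  # older than everything already retained
    return skip

pkgs = [Pkg("glibc", "x86_64", 0, v, "1") for v in ("2.33", "2.35", "2.34")]
print(packages_to_skip(pkgs, retain=2))  # skips the 2.33 package

A min-heap keyed by EVR makes the eviction choice explicit: the oldest retained version always sits at the top, so each incoming package either displaces it or is skipped outright.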
