From 0cdaf529577f67800b58d1f8d2e10da189f2c1d3 Mon Sep 17 00:00:00 2001 From: Grant Gainey Date: Mon, 31 Oct 2022 10:59:37 -0400 Subject: [PATCH] Fixed concurrent-ks-tree-syncs by making disttree ID repo-specific. DistributionTree digest and subrepo-names now both end with the pulp-id of the "owning" Repository, making them unique to that repo and therefore protected from concurrent-updates against anything that is changing that Repository. Addon/Variant/Image are transitively made unique by virtue of having their DistributionTree be part of their unique-together constraint. Sub-repo **content** (e.g., Packages and other content types) is de-duplicated via its existing uniqueness constraints. The end result is a minor increase in Content objects (i.e., DistTrees/Addons/Images/Variants that used to have only one instance are now one-per-containing-repo), and a small impact on subrepo-syncing (since previously-unique subrepos will now have a first-sync that would have been skipped). Content will continue to be synced only once. fixes #2278. [nocoverage] --- CHANGES/2278.bugfix | 3 +++ pulp_rpm/app/tasks/synchronizing.py | 26 +++++++++++++++----------- 2 files changed, 18 insertions(+), 11 deletions(-) create mode 100644 CHANGES/2278.bugfix diff --git a/CHANGES/2278.bugfix b/CHANGES/2278.bugfix new file mode 100644 index 000000000..63c051b93 --- /dev/null +++ b/CHANGES/2278.bugfix @@ -0,0 +1,3 @@ +Fixed concurrent-overlapping-sync of subrepos by making them repository-unique. + +This change is transparent to end-users. 
diff --git a/pulp_rpm/app/tasks/synchronizing.py b/pulp_rpm/app/tasks/synchronizing.py index c0ce020f0..cae5cb0fc 100644 --- a/pulp_rpm/app/tasks/synchronizing.py +++ b/pulp_rpm/app/tasks/synchronizing.py @@ -484,24 +484,16 @@ def is_subrepo(directory): with tempfile.TemporaryDirectory(dir="."): remote_url = fetch_remote_url(remote, url) - sync_details = get_sync_details(remote, remote_url, sync_policy, repository) - - repo_sync_config[PRIMARY_REPO] = { - "should_skip": should_optimize_sync(sync_details, repository.last_sync_details), - "sync_details": sync_details, - "url": remote_url, - "repo": repository, - } + # Find and set up to deal with any subtrees treeinfo = get_treeinfo_data(remote, remote_url) - if treeinfo: treeinfo["repositories"] = {} for repodata in set(treeinfo["download"]["repodatas"]): if repodata == DIST_TREE_MAIN_REPO_PATH: treeinfo["repositories"].update({repodata: None}) continue - name = f"{repodata}-{treeinfo['hash']}" + name = f"{repodata}-{treeinfo['hash']}-{repository.pulp_id}" sub_repo, created = RpmRepository.objects.get_or_create(name=name, user_hidden=True) if created: sub_repo.save() @@ -527,6 +519,15 @@ def is_subrepo(directory): "repo": sub_repo, } + # Set up to deal with the primary repository + sync_details = get_sync_details(remote, remote_url, sync_policy, repository) + repo_sync_config[PRIMARY_REPO] = { + "should_skip": should_optimize_sync(sync_details, repository.last_sync_details), + "sync_details": sync_details, + "url": remote_url, + "repo": repository, + } + # If all repos are exactly the same, we should skip all further processing, even in # metadata-mirror mode if optimize and all([config["should_skip"] for config in repo_sync_config.values()]): @@ -541,6 +542,8 @@ def is_subrepo(directory): repo_sync_results = {} # If some repos need to be synced and others do not, we go through them all + # items() returns in insertion-order - make sure PRIMARY is the LAST thing we process + # here, or autopublish will fail to find 
any subrepo-content. for directory, repo_config in repo_sync_config.items(): repo = repo_config["repo"] # If metadata_mirroring is enabled we cannot skip any syncs, because the generated @@ -878,7 +881,8 @@ async def parse_distribution_tree(self): ) d_artifacts.append(da) - self.treeinfo["distribution_tree"]["digest"] = self.treeinfo["hash"] + tree_digest = f'{self.treeinfo["hash"]}-{self.repository.pulp_id}' + self.treeinfo["distribution_tree"]["digest"] = tree_digest distribution_tree = DistributionTree(**self.treeinfo["distribution_tree"]) dc = DeclarativeContent(content=distribution_tree, d_artifacts=d_artifacts) dc.extra_data = self.treeinfo