From 69d71847dcf2d376ef559494a17ec281c9da69d8 Mon Sep 17 00:00:00 2001 From: Schuyler Martin Date: Thu, 5 Dec 2024 09:04:47 -0700 Subject: [PATCH] 260 Parallelize multiple source fetch requests (#264) * First pass at parallelizing multiple source fetches * Moves to use ThreadPoolExecutor model over ThreadPool so that the script can be exited gracefully on a network failure * Adds notes on slow tests * Adds a --dry-run flag that can dump a modified recipe to the console instead of saving it to a file --- conda_recipe_manager/commands/bump_recipe.py | 67 +++++++++++++++----- tests/commands/test_bump_recipe.py | 7 +- 2 files changed, 56 insertions(+), 18 deletions(-) diff --git a/conda_recipe_manager/commands/bump_recipe.py b/conda_recipe_manager/commands/bump_recipe.py index 98cf9562..a92a2139 100644 --- a/conda_recipe_manager/commands/bump_recipe.py +++ b/conda_recipe_manager/commands/bump_recipe.py @@ -4,6 +4,7 @@ from __future__ import annotations +import concurrent.futures as cf import logging import sys import time @@ -246,6 +247,20 @@ def _update_sha256_check_hash_var( return False +def _update_sha256_fetch_one(src_path: str, fetcher: HttpArtifactFetcher, retry_interval: float) -> tuple[str, str]: + """ + Helper function that retrieves a single HTTP source artifact, so that we can parallelize network requests. + + :param src_path: Recipe key path to the applicable artifact source. + :param fetcher: Artifact fetching instance to use. + :param retry_interval: Scalable interval between fetch requests. + :raises FetchError: In the event that the retry mechanism failed to fetch a source artifact. + :returns: A tuple containing the path to and the actual SHA-256 value to be updated. + """ + sha = _get_sha256(fetcher, retry_interval) + return (RecipeParser.append_to_path(src_path, "/sha256"), sha) + + def _update_sha256(recipe_parser: RecipeParser, retry_interval: float) -> None: """ Attempts to update the SHA-256 hash(s) in the `/source` section of a recipe file, if applicable. Note that this is @@ -270,20 +285,31 @@ def _update_sha256(recipe_parser: RecipeParser, retry_interval: float) -> None: # different hashes might imply there is a security threat. We will log some statistics so the user can best decide # what to do. unique_hashes: set[str] = set() - total_hash_cntr = 0 - # TODO parallelize requests for multiple sources - for src_path, fetcher in fetcher_tbl.items(): - if not isinstance(fetcher, HttpArtifactFetcher): - continue - try: - sha = _get_sha256(fetcher, retry_interval) - except FetchError: - _exit_on_failed_fetch(fetcher) - total_hash_cntr += 1 - unique_hashes.add(sha) - sha_path = RecipeParser.append_to_path(src_path, "/sha256") + # Filter-out artifacts that don't need a SHA-256 hash. + http_fetcher_tbl: Final[dict[str, HttpArtifactFetcher]] = { + k: v for k, v in fetcher_tbl.items() if isinstance(v, HttpArtifactFetcher) + } + # Parallelize on acquiring multiple source artifacts on the network. In testing, using a process pool took + # significantly more time and resources. That aligns with how I/O bound this process is. We use the + # `ThreadPoolExecutor` class over a `ThreadPool` so the script may exit gracefully if we failed to acquire an + # artifact. + sha_path_to_sha_tbl: dict[str, str] = {} + with cf.ThreadPoolExecutor() as executor: + artifact_futures_tbl = { + executor.submit(_update_sha256_fetch_one, src_path, fetcher, retry_interval): fetcher + for src_path, fetcher in http_fetcher_tbl.items() + } + for future in cf.as_completed(artifact_futures_tbl): + fetcher = artifact_futures_tbl[future] + try: + resolved_tuple = future.result() + sha_path_to_sha_tbl[resolved_tuple[0]] = resolved_tuple[1] + except FetchError: + _exit_on_failed_fetch(fetcher) + for sha_path, sha in sha_path_to_sha_tbl.items(): + unique_hashes.add(sha) # Guard against the unlikely scenario that the `sha256` field is missing. patch_op = "replace" if recipe_parser.contains_value(sha_path) else "add" _exit_on_failed_patch(recipe_parser, {"op": patch_op, "path": sha_path, "value": sha}) @@ -291,7 +317,7 @@ def _update_sha256(recipe_parser: RecipeParser, retry_interval: float) -> None: log.info( "Found %d unique SHA-256 hash(es) out of a total of %d hash(es) in %d sources.", len(unique_hashes), - total_hash_cntr, + len(sha_path_to_sha_tbl), len(fetcher_tbl), ) @@ -308,6 +334,12 @@ def _update_sha256(recipe_parser: RecipeParser, retry_interval: float) -> None: is_flag=True, help="Bump the build number by 1.", ) +@click.option( + "-d", + "--dry-run", + is_flag=True, + help="Performs a dry-run operation that prints the recipe to STDOUT and does not save to the recipe file.", +) @click.option( "-t", "--target-version", @@ -327,7 +359,9 @@ def _update_sha256(recipe_parser: RecipeParser, retry_interval: float) -> None: f" Defaults to {_DEFAULT_RETRY_INTERVAL} seconds" ), ) -def bump_recipe(recipe_file_path: str, build_num: bool, target_version: Optional[str], retry_interval: float) -> None: +def bump_recipe( + recipe_file_path: str, build_num: bool, dry_run: bool, target_version: Optional[str], retry_interval: float +) -> None: """ Bumps a recipe to a new version. @@ -367,5 +401,8 @@ def bump_recipe(recipe_file_path: str, build_num: bool, target_version: Optional _update_version(recipe_parser, target_version) _update_sha256(recipe_parser, retry_interval) - Path(recipe_file_path).write_text(recipe_parser.render(), encoding="utf-8") + if dry_run: + print(recipe_parser.render()) + else: + Path(recipe_file_path).write_text(recipe_parser.render(), encoding="utf-8") sys.exit(ExitCode.SUCCESS) diff --git a/tests/commands/test_bump_recipe.py b/tests/commands/test_bump_recipe.py index 0ee3bebc..7f4b6588 100644 --- a/tests/commands/test_bump_recipe.py +++ b/tests/commands/test_bump_recipe.py @@ -87,6 +87,7 @@ def test_usage() -> None: # Does not use `version` variable, has a non-zero build number. Note that the URL is not parameterized on the # version field. ("gsm-amzn2-aarch64.yaml", None, "bump_recipe/gsm-amzn2-aarch64_build_num_6.yaml"), + # TODO Fix this slow test tracked by Issue #265 ("gsm-amzn2-aarch64.yaml", "2.0.20210721.2", "bump_recipe/gsm-amzn2-aarch64_version_bump.yaml"), # Has a `sha256` variable ("pytest-pep8.yaml", None, "bump_recipe/pytest-pep8_build_num_2.yaml"), @@ -102,6 +103,7 @@ def test_usage() -> None: ("curl.yaml", "8.11.0", "bump_recipe/curl_version_bump.yaml"), # NOTE: libprotobuf has multiple sources, on top of being multi-output ("libprotobuf.yaml", None, "bump_recipe/libprotobuf_build_num_1.yaml"), + # TODO Fix this slow test tracked by Issue #265 ("libprotobuf.yaml", "25.3", "bump_recipe/libprotobuf_version_bump.yaml"), # Validates removal of `hash_type` variable that is sometimes used instead of the `/source/sha256` key ("types-toml_hash_type.yaml", None, "bump_recipe/types-toml_hash_type_build_num_1.yaml"), @@ -158,9 +160,8 @@ def test_bump_recipe_cli( [ ("bump_recipe/types-toml_bad_url.yaml", "0.10.8.20240310", 5), ("bump_recipe/types-toml_bad_url_hash_var.yaml", "0.10.8.20240310", 5), - # As of writing, the script will fail on the first URL that fails to be fetched, thus the count is half what - # one might expect. - ("bump_recipe/types-toml_bad_url_multi_source.yaml", "0.10.8.20240310", 5), + # Note that with futures, all 10 (5 by 2 sources) should occur by the time the futures are fully resolved. + ("bump_recipe/types-toml_bad_url_multi_source.yaml", "0.10.8.20240310", 10), # TODO validate V1 recipe files ], )