Skip to content

Commit

Permalink
260 Parallelize multiple source fetch requests (#264)
Browse files Browse the repository at this point in the history
* First pass at parallelizing multiple source fetches

* Moves to use ThreadPoolExecutor model over ThreadPool so that the script can be exited gracefully on a network failure

* Adds notes on slow tests

* Adds a --dry-run flag that can dump a modified recipe to the console instead of saving it to a file
  • Loading branch information
schuylermartin45 authored Dec 5, 2024
1 parent 2a446b6 commit 69d7184
Show file tree
Hide file tree
Showing 2 changed files with 56 additions and 18 deletions.
67 changes: 52 additions & 15 deletions conda_recipe_manager/commands/bump_recipe.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

from __future__ import annotations

import concurrent.futures as cf
import logging
import sys
import time
Expand Down Expand Up @@ -246,6 +247,20 @@ def _update_sha256_check_hash_var(
return False


def _update_sha256_fetch_one(src_path: str, fetcher: HttpArtifactFetcher, retry_interval: float) -> tuple[str, str]:
    """
    Fetches one HTTP-hosted source artifact and computes its SHA-256 hash. Kept as a stand-alone
    helper so multiple source fetches can be dispatched concurrently (e.g. via a thread pool).
    :param src_path: Recipe key path to the applicable artifact source.
    :param fetcher: Artifact fetching instance to use.
    :param retry_interval: Scalable interval between fetch requests.
    :raises FetchError: In the event that the retry mechanism failed to fetch a source artifact.
    :returns: A tuple containing the path to and the actual SHA-256 value to be updated.
    """
    # Fetch first: a FetchError must propagate before any path construction happens.
    hash_value = _get_sha256(fetcher, retry_interval)
    sha256_key_path = RecipeParser.append_to_path(src_path, "/sha256")
    return sha256_key_path, hash_value


def _update_sha256(recipe_parser: RecipeParser, retry_interval: float) -> None:
"""
Attempts to update the SHA-256 hash(s) in the `/source` section of a recipe file, if applicable. Note that this is
Expand All @@ -270,28 +285,39 @@ def _update_sha256(recipe_parser: RecipeParser, retry_interval: float) -> None:
# different hashes might imply there is a security threat. We will log some statistics so the user can best decide
# what to do.
unique_hashes: set[str] = set()
total_hash_cntr = 0
# TODO parallelize requests for multiple sources
for src_path, fetcher in fetcher_tbl.items():
if not isinstance(fetcher, HttpArtifactFetcher):
continue

try:
sha = _get_sha256(fetcher, retry_interval)
except FetchError:
_exit_on_failed_fetch(fetcher)
total_hash_cntr += 1
unique_hashes.add(sha)
sha_path = RecipeParser.append_to_path(src_path, "/sha256")
# Filter-out artifacts that don't need a SHA-256 hash.
http_fetcher_tbl: Final[dict[str, HttpArtifactFetcher]] = {
k: v for k, v in fetcher_tbl.items() if isinstance(v, HttpArtifactFetcher)
}
# Parallelize on acquiring multiple source artifacts on the network. In testing, using a process pool took
# significantly more time and resources. That aligns with how I/O bound this process is. We use the
# `ThreadPoolExecutor` class over a `ThreadPool` so the script may exit gracefully if we failed to acquire an
# artifact.
sha_path_to_sha_tbl: dict[str, str] = {}
with cf.ThreadPoolExecutor() as executor:
artifact_futures_tbl = {
executor.submit(_update_sha256_fetch_one, src_path, fetcher, retry_interval): fetcher
for src_path, fetcher in http_fetcher_tbl.items()
}
for future in cf.as_completed(artifact_futures_tbl):
fetcher = artifact_futures_tbl[future]
try:
resolved_tuple = future.result()
sha_path_to_sha_tbl[resolved_tuple[0]] = resolved_tuple[1]
except FetchError:
_exit_on_failed_fetch(fetcher)

for sha_path, sha in sha_path_to_sha_tbl.items():
unique_hashes.add(sha)
# Guard against the unlikely scenario that the `sha256` field is missing.
patch_op = "replace" if recipe_parser.contains_value(sha_path) else "add"
_exit_on_failed_patch(recipe_parser, {"op": patch_op, "path": sha_path, "value": sha})

log.info(
"Found %d unique SHA-256 hash(es) out of a total of %d hash(es) in %d sources.",
len(unique_hashes),
total_hash_cntr,
len(sha_path_to_sha_tbl),
len(fetcher_tbl),
)

Expand All @@ -308,6 +334,12 @@ def _update_sha256(recipe_parser: RecipeParser, retry_interval: float) -> None:
is_flag=True,
help="Bump the build number by 1.",
)
@click.option(
"-d",
"--dry-run",
is_flag=True,
help="Performs a dry-run operation that prints the recipe to STDOUT and does not save to the recipe file.",
)
@click.option(
"-t",
"--target-version",
Expand All @@ -327,7 +359,9 @@ def _update_sha256(recipe_parser: RecipeParser, retry_interval: float) -> None:
f" Defaults to {_DEFAULT_RETRY_INTERVAL} seconds"
),
)
def bump_recipe(recipe_file_path: str, build_num: bool, target_version: Optional[str], retry_interval: float) -> None:
def bump_recipe(
recipe_file_path: str, build_num: bool, dry_run: bool, target_version: Optional[str], retry_interval: float
) -> None:
"""
Bumps a recipe to a new version.
Expand Down Expand Up @@ -367,5 +401,8 @@ def bump_recipe(recipe_file_path: str, build_num: bool, target_version: Optional
_update_version(recipe_parser, target_version)
_update_sha256(recipe_parser, retry_interval)

Path(recipe_file_path).write_text(recipe_parser.render(), encoding="utf-8")
if dry_run:
print(recipe_parser.render())
else:
Path(recipe_file_path).write_text(recipe_parser.render(), encoding="utf-8")
sys.exit(ExitCode.SUCCESS)
7 changes: 4 additions & 3 deletions tests/commands/test_bump_recipe.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ def test_usage() -> None:
# Does not use `version` variable, has a non-zero build number. Note that the URL is not parameterized on the
# version field.
("gsm-amzn2-aarch64.yaml", None, "bump_recipe/gsm-amzn2-aarch64_build_num_6.yaml"),
# TODO Fix this slow test tracked by Issue #265
("gsm-amzn2-aarch64.yaml", "2.0.20210721.2", "bump_recipe/gsm-amzn2-aarch64_version_bump.yaml"),
# Has a `sha256` variable
("pytest-pep8.yaml", None, "bump_recipe/pytest-pep8_build_num_2.yaml"),
Expand All @@ -102,6 +103,7 @@ def test_usage() -> None:
("curl.yaml", "8.11.0", "bump_recipe/curl_version_bump.yaml"),
# NOTE: libprotobuf has multiple sources, on top of being multi-output
("libprotobuf.yaml", None, "bump_recipe/libprotobuf_build_num_1.yaml"),
# TODO Fix this slow test tracked by Issue #265
("libprotobuf.yaml", "25.3", "bump_recipe/libprotobuf_version_bump.yaml"),
# Validates removal of `hash_type` variable that is sometimes used instead of the `/source/sha256` key
("types-toml_hash_type.yaml", None, "bump_recipe/types-toml_hash_type_build_num_1.yaml"),
Expand Down Expand Up @@ -158,9 +160,8 @@ def test_bump_recipe_cli(
[
("bump_recipe/types-toml_bad_url.yaml", "0.10.8.20240310", 5),
("bump_recipe/types-toml_bad_url_hash_var.yaml", "0.10.8.20240310", 5),
# As of writing, the script will fail on the first URL that fails to be fetched, thus the count is half what
# one might expect.
("bump_recipe/types-toml_bad_url_multi_source.yaml", "0.10.8.20240310", 5),
# Note that with futures, all 10 (5 by 2 sources) should occur by the time the futures are fully resolved.
("bump_recipe/types-toml_bad_url_multi_source.yaml", "0.10.8.20240310", 10),
# TODO validate V1 recipe files
],
)
Expand Down

0 comments on commit 69d7184

Please sign in to comment.