Skip to content

Commit

Permalink
Artifact Fetching Module (#118)
Browse files Browse the repository at this point in the history
* Initial outline for the first fetching module

* Breaks initial outline up into several sub-classes to better manage the differences in artifact type

* Adds some in-progress documentation

* Fleshes out fetch exceptions

* Shores up documentation in the artifact base class

* Finishes template for git Artifact Fetcher

* Starts work, in-earnest, on the HTTP artifact fetcher

* First-pass implementation of the _extract() function

* Implements get_archive_sha256()

* Adds documentation

* Fetcher exceptions are now under a dedicated exception module

* Fixes current linting errors

* Fixes static analyzer issues

* The RecipeReader.get() call in from_recipe() now uses sub_vars to resolve JINJA variables

* README changes

* Fixes an issue with deriving archive names from URLs

* Fixes more extraction naming issues

* Introduces hash utility library to standardize hashing usage. Removes existing hash buffering work.

* Adds unit tests for new hashing utility module

* Remove unnecessary uses of pass. This gives a much more accurate test-coverage statistic for abstract classes

* Starts work on artifact fetching unit tests

* Adds advanced HTTP mockers

* Adds unit test for fetch() that mocks the file system and HTTP requests

* Adds missing pyfakefs requirement to the recipe file

* Fixing build test by including conda-forge

* Adds fetch failure unit tests

* Refactors test_fetch() to use test params instead of fixture params for more dynamic expected value checking

* Adds test_get_path_to_source_code() to test the happy-path of that function

* Adds unit test for get_archive_sha256()

* Adds unit test for get_archive_type()

* Update conda_recipe_manager/fetcher/base_artifact_fetcher.py

Co-authored-by: Bianca Henderson <beeankha@gmail.com>

---------

Co-authored-by: Bianca Henderson <beeankha@gmail.com>
  • Loading branch information
schuylermartin45 and beeankha authored Sep 13, 2024
1 parent 1ccfc83 commit aa15b12
Show file tree
Hide file tree
Showing 22 changed files with 901 additions and 8 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/commit_checks.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ jobs:
run: |
source $CONDA/bin/activate
conda install -y conda-build
conda build recipe/meta.yaml
conda build -c defaults -c conda-forge recipe/meta.yaml
# Eat our own dog food and build this project with rattler-build by converting our existing recipe.
build-recipe-rattler:
runs-on: ubuntu-latest
Expand Down
4 changes: 4 additions & 0 deletions conda_recipe_manager/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,10 @@ Some of these CLIs are very demo-focused and others provide significant value, l
All commands defined are subcommands of the `conda-recipe-manager` command. The top-level command has also been
abbreviated to `crm` for your typing convenience.

### `fetcher` (WIP)
This module provides tools for fetching and normalizing remote resources. Downloaded files are stored in secure
temporary directories.

### `grapher` (WIP)
This module provides tools that are capable of plotting and understanding how recipe dependencies are related to each
other.
Expand Down
Empty file.
31 changes: 31 additions & 0 deletions conda_recipe_manager/fetcher/artifact_fetcher.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
"""
:Description: Provides a factory function for building Artifact Fetchers from a parsed recipe (work in progress).
"""

from __future__ import annotations

from typing import Final, cast

from conda_recipe_manager.fetcher.base_artifact_fetcher import BaseArtifactFetcher
from conda_recipe_manager.parser.recipe_reader import RecipeReader
from conda_recipe_manager.types import Primitives

# Identifying string used to flag temp files and directories created by this module.
# NOTE(review): Nothing in this module references this constant yet, and `base_artifact_fetcher.py` declares an
# identical one. Consider sharing a single definition -- TODO confirm which module should own it.
_ARTIFACT_FETCHER_FILE_ID: Final[str] = "crm_artifact_fetcher"


def from_recipe(recipe: RecipeReader) -> list[BaseArtifactFetcher]:
    """
    Reads the `/source` section of a recipe with the goal of building one Artifact Fetcher per source entry.

    NOTE: This function is a work in progress. The source entries are read and normalized, but no fetchers are
    constructed yet, so the returned list is currently always empty.

    :param recipe: Recipe to read the `/source` section from. JINJA variables are substituted (`sub_vars=True`).
    :returns: List of Artifact Fetchers built from the recipe (currently always empty).
    """
    # TODO add source-specific parser?
    raw_sources = cast(
        dict[str, Primitives] | list[dict[str, Primitives]], recipe.get_value("/source", sub_vars=True)
    )
    # The `/source` section is either a single dictionary or a list of dictionaries. Normalize to a list so that
    # both forms can be handled by the same loop.
    source_entries = raw_sources if isinstance(raw_sources, list) else [raw_sources]

    fetchers: list[BaseArtifactFetcher] = []
    for _ in source_entries:
        # TODO Complete: construct the appropriate fetcher for each source entry.
        pass
    return fetchers
57 changes: 57 additions & 0 deletions conda_recipe_manager/fetcher/base_artifact_fetcher.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
"""
:Description: Provides a base class that all Artifact Fetcher are derived from.
"""

from __future__ import annotations

from abc import ABCMeta, abstractmethod
from pathlib import Path
from tempfile import TemporaryDirectory
from typing import Final

# Identifying string used to flag temp files and directories created by this module. It is used (followed by an
# underscore) as the prefix of the temporary directory name created in `BaseArtifactFetcher.__init__()`.
_ARTIFACT_FETCHER_FILE_ID: Final[str] = "crm_artifact_fetcher"


class BaseArtifactFetcher(metaclass=ABCMeta):
    """
    Abstract parent of every `ArtifactFetcher` class. An `ArtifactFetcher` provides a standard set of tools to
    retrieve bundles of source code.

    Each instance owns a `TemporaryDirectory` that retrieved files are stored in. That directory is cleaned up along
    with the `TemporaryDirectory` object, which is held for the lifetime of the Artifact Fetcher instance.
    """

    def __init__(self, name: str) -> None:
        """
        Constructs a BaseArtifactFetcher and creates the temporary directory that backs it.

        :param name: Identifies the artifact. Ideally, this is the package name. In multi-sourced/mirrored scenarios,
            this might be the package name combined with some identifying information. The name is also used as the
            suffix of the temporary directory's name.
        """
        self._name = name
        # Tracks whether `fetch()` has completed successfully at least once.
        self._successfully_fetched = False

        # The directory must outlive this constructor, so a `with` block cannot be used here.
        # NOTE: There is an open issue about this pylint edge case: https://github.com/pylint-dev/pylint/issues/7658
        self._temp_dir: Final[TemporaryDirectory[str]] = TemporaryDirectory(  # pylint: disable=consider-using-with
            prefix=f"{_ARTIFACT_FETCHER_FILE_ID}_", suffix=f"_{name}"
        )
        # Convenience `Path` handle to the same directory.
        self._temp_dir_path: Final[Path] = Path(self._temp_dir.name)

    @abstractmethod
    def fetch(self) -> None:
        """
        Retrieves the build artifact and source code and dumps it to a secure temporary location.

        "Gretchen, stop trying to make fetch happen! It's not going to happen!" - Regina George

        :raises FetchError: When the target artifact fails to be acquired.
        """

    @abstractmethod
    def get_path_to_source_code(self) -> Path:
        """
        Returns the directory containing the artifact's bundled source code.

        :raises FetchRequiredError: If a call to `fetch()` is required before using this function.
        :returns: Path to the directory that holds the source code.
        """
43 changes: 43 additions & 0 deletions conda_recipe_manager/fetcher/exceptions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
"""
:Description: Provides exceptions for fetching modules.
"""

from __future__ import annotations


class FetcherException(Exception):
    """
    Base exception for all other artifact fetching exceptions. Should not be raised directly.
    """


class FetchError(FetcherException):
    """
    General exception to be thrown when there is a failure to fetch an artifact.
    """

    def __init__(self, message: str = "") -> None:
        """
        Constructs a FetchError Exception.

        :param message: (Optional) String description of the issue encountered. If empty or omitted, a generic
            description is used instead.
        """
        # An empty message falls back to the generic description so that the exception is never blank.
        self.message = message or "An unknown error occurred while trying to fetch an artifact."
        super().__init__(self.message)


class FetchRequiredError(FetcherException):
    """
    This operation could not be performed because a call to `fetch()` has not yet succeeded.
    """

    def __init__(self, message: str = "") -> None:
        """
        Constructs a FetchRequiredError Exception.

        :param message: (Optional) String description of the issue encountered. If empty or omitted, a generic
            description is used instead.
        """
        # An empty message falls back to the generic description so that the exception is never blank.
        self.message = message or "An operation could not be completed as the artifact has not been fetched."
        super().__init__(self.message)
43 changes: 43 additions & 0 deletions conda_recipe_manager/fetcher/git_artifact_fetcher.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
"""
:Description: Provides an Artifact Fetcher capable of acquiring source code from a remote git repository.
"""

from __future__ import annotations

from pathlib import Path

from conda_recipe_manager.fetcher.base_artifact_fetcher import BaseArtifactFetcher


class GitArtifactFetcher(BaseArtifactFetcher):
    """
    Artifact Fetcher intended to clone a remote git repository.

    NOTE: This class is currently a template. The cloning logic has not been implemented yet.
    """

    def __init__(self, name: str, git_url: str):
        """
        Constructs a `GitArtifactFetcher` instance.

        TODO add other params

        :param name: Identifies the artifact. Forwarded to the `BaseArtifactFetcher` constructor.
        :param git_url: URL of the remote git repository.
        """
        super().__init__(name)
        self._git_url = git_url

    def _clone(self) -> None:
        """
        Placeholder for the logic that clones the remote repository. Currently does nothing.

        TODO implement
        """

    def fetch(self) -> None:
        """
        Retrieves the repository by delegating to `_clone()`, which is currently a no-op.

        TODO complete once `_clone()` is implemented
        """
        self._clone()

    def get_path_to_source_code(self) -> Path:
        """
        Returns the directory containing the artifact's bundled source code.

        NOTE: Placeholder implementation. It currently returns an empty `Path()` and performs no check that
        `fetch()` has been called.

        :returns: An empty `Path()` instance.
        """
        return Path()
156 changes: 156 additions & 0 deletions conda_recipe_manager/fetcher/http_artifact_fetcher.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
"""
:Description: Provides an Artifact Fetcher capable of acquiring a software archive from an HTTP/HTTPS source.
"""

from __future__ import annotations

import tarfile
import zipfile
from enum import Enum, auto
from pathlib import Path
from typing import Final, Iterator, cast
from urllib.parse import urlparse

import requests

from conda_recipe_manager.fetcher.base_artifact_fetcher import BaseArtifactFetcher
from conda_recipe_manager.fetcher.exceptions import FetchError, FetchRequiredError
from conda_recipe_manager.utils.cryptography.hashing import hash_file

# Default download timeout for artifacts, in seconds. This value is passed as the `timeout` argument of
# `requests.get()` when an archive is fetched.
_DOWNLOAD_TIMEOUT: Final[int] = 5 * 60  # 5 minutes


class ArtifactArchiveType(Enum):
    """
    Enumeration of the archive file formats that a downloaded artifact may be identified as.
    """

    # ZIP archive
    ZIP = auto()
    # 7zip archive
    # TODO determine how to do this in Python
    ZIP_7 = auto()
    # Tar archive
    TARBALL = auto()
    # Could not determine the artifact type
    UNKNOWN = auto()


class HttpArtifactFetcher(BaseArtifactFetcher):
    """
    Artifact Fetcher capable of downloading a software archive from a remote HTTP/HTTPS source.
    """

    def __init__(self, name: str, archive_url: str):
        """
        Constructs an `HttpArtifactFetcher` instance.

        :param name: Identifies the artifact. Ideally, this is the package name. In multi-sourced/mirrored scenarios,
            this might be the package name combined with some identifying information.
        :param archive_url: URL that points to the target software archive.
        """
        super().__init__(name)
        self._archive_url = archive_url
        # The real type is only known once the downloaded file has been inspected by `_extract()`.
        self._archive_type = ArtifactArchiveType.UNKNOWN

        # We use `urlparse` to extract the file path containing the archive. This can be used to get the archive's file
        # name. Many of the archive files we deal with contain the version number with period markings. We also work
        # with archives with many different file extensions. To avoid the many pitfalls here of trying to calculate the
        # "true basename" of the file, we just pre-pend `extracted_` to indicate this is the folder containing the
        # extracted archive.
        #
        # URLs without a file component (e.g. `https://example.com/?id=1`) yield an empty name. An empty name would
        # make `_archive_path` point at the temporary directory itself, so a generic file name is used instead.
        archive_file_name: Final[str] = Path(urlparse(self._archive_url).path).name or "archive"
        extracted_dir_name: Final[str] = f"extracted_{archive_file_name}"

        self._archive_path: Final[Path] = self._temp_dir_path / archive_file_name
        self._uncompressed_archive_path: Final[Path] = self._temp_dir_path / extracted_dir_name

    def _fetch_guard(self, msg: str) -> None:
        """
        Convenience function that prevents executing functions that require the archive to be downloaded.

        :param msg: Message to attach to the exception.
        :raises FetchRequiredError: If `fetch()` has not been successfully invoked.
        """
        if self._successfully_fetched:
            return
        raise FetchRequiredError(msg)

    def _extract(self) -> None:
        """
        Identifies the type of the downloaded archive by inspecting the file and extracts its contents into
        `self._uncompressed_archive_path`. The detected type is recorded in `self._archive_type`.

        :raises FetchError: If the archive type is not recognized or an issue occurred while extracting the archive.
        """
        try:
            if tarfile.is_tarfile(self._archive_path):
                self._archive_type = ArtifactArchiveType.TARBALL
                with tarfile.open(self._archive_path, mode="r") as tar_file:
                    # The `filter="data"` parameter guards against "the most dangerous security issues"
                    tar_file.extractall(path=self._uncompressed_archive_path, filter="data")
            elif zipfile.is_zipfile(self._archive_path):
                self._archive_type = ArtifactArchiveType.ZIP
                with zipfile.ZipFile(self._archive_path) as zip_file:
                    # TODO improve security checks
                    zip_file.extractall(path=self._uncompressed_archive_path)
            # TODO 7-zip support
            else:
                # `FetchError` is not matched by the handlers below, so it propagates to the caller unchanged.
                raise FetchError("The archive type could not be identified.")
        except (tarfile.TarError, zipfile.BadZipFile, ValueError) as e:
            raise FetchError("An extraction error occurred while extracting the archive.") from e
        except IOError as e:
            raise FetchError("A file system error occurred while extracting the archive.") from e

    def fetch(self) -> None:
        """
        Retrieves a software archive from a remote HTTP/HTTPS host and stores the files in a secure temporary directory.

        :raises FetchError: If an issue occurred while downloading or extracting the archive. This includes the remote
            host answering with an HTTP error status.
        """
        # Buffered download approach
        try:
            # The context manager guarantees that the streamed connection is released, even when an error occurs.
            with requests.get(str(self._archive_url), stream=True, timeout=_DOWNLOAD_TIMEOUT) as response:
                # Without this check, the body of an error response (404, 500, ...) would be written to disk as if it
                # were the archive. `HTTPError` derives from `RequestException`, so it is handled below.
                response.raise_for_status()
                with open(self._archive_path, "wb") as archive:
                    for chunk in cast(Iterator[bytes], response.iter_content(chunk_size=1024)):
                        # Skip empty chunks instead of aborting so that the download can never be cut short by one.
                        if chunk:
                            archive.write(chunk)
        # NOTE: `RequestException` derives from `IOError`, so this handler must come before the `IOError` handler.
        except requests.exceptions.RequestException as e:  # type: ignore[misc]
            raise FetchError("An HTTP error occurred while fetching the archive.") from e
        except IOError as e:
            raise FetchError("A file system error occurred while fetching the archive.") from e

        self._extract()

        # If we have not thrown at this point, we have successfully fetched the archive.
        self._successfully_fetched = True

    def get_path_to_source_code(self) -> Path:
        """
        Returns the directory containing the artifact's bundled source code.

        NOTE: If the target archive wraps the source code in a top-level folder, this path will point to the
        directory that contains that extracted top-level folder.

        :raises FetchRequiredError: If `fetch()` has not been successfully invoked.
        :returns: Path to the directory the archive was extracted into.
        """
        self._fetch_guard("Archive has not been downloaded, so the source code is unavailable.")

        return self._uncompressed_archive_path

    def get_archive_sha256(self) -> str:
        """
        Calculates a SHA-256 hash on the downloaded software archive.

        :raises FetchRequiredError: If `fetch()` has not been successfully invoked.
        :returns: The SHA-256 hash of the downloaded archive file, as calculated by `hash_file()`.
        """
        self._fetch_guard("Archive has not been downloaded, so the file can't be hashed.")

        return hash_file(self._archive_path, "sha256")

    def get_archive_type(self) -> ArtifactArchiveType:
        """
        Returns the type of archive that was retrieved. The type was determined by inspecting the file contents and
        not by the file name.

        :raises FetchRequiredError: If `fetch()` has not been successfully invoked.
        :returns: The archive type detected during extraction.
        """
        self._fetch_guard("Archive has not been downloaded, so the type can't be determined.")

        return self._archive_type
5 changes: 2 additions & 3 deletions conda_recipe_manager/parser/recipe_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
from conda_recipe_manager.parser.enums import SchemaVersion
from conda_recipe_manager.parser.types import TAB_AS_SPACES, TAB_SPACE_COUNT, MultilineVariant
from conda_recipe_manager.types import PRIMITIVES_TUPLE, JsonType, Primitives, SentinelType
from conda_recipe_manager.utils.cryptography.hashing import hash_str

# Import guard: Fallback to `SafeLoader` if `CSafeLoader` isn't available
try:
Expand Down Expand Up @@ -1130,6 +1131,4 @@ def calc_sha256(self) -> str:
:returns: SHA-256 hash of the current recipe state.
"""
# NOTE: If we need to hash larger recipes, we may want to consider a buffered
# approach: https://stackoverflow.com/questions/22058048/hashing-a-file-in-python
return hashlib.sha256(self.render().encode()).hexdigest()
return hash_str(self.render(), hashlib.sha256)
Empty file.
Empty file.
Loading

0 comments on commit aa15b12

Please sign in to comment.