-
Notifications
You must be signed in to change notification settings - Fork 19
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Signed-off-by: miguel <miguel.brandao@ibm.com> Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> Co-authored-by: miguel <miguel.brandao@ibm.com>
- Loading branch information
1 parent
e183c6c
commit 77379c2
Showing
9 changed files
with
372 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
<mxfile host="Electron" modified="2023-06-05T12:07:18.417Z" agent="5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) draw.io/20.3.0 Chrome/104.0.5112.114 Electron/20.1.3 Safari/537.36" etag="JlqAnVsnxsl_rzqtyEW0" version="20.3.0" type="device"><diagram id="uD-RMPnPRatl6ZgYXZys" name="Page-1">7Vtdc5s4FP01fowHJMD4MY7T7uy0s53NdLZ92pFBGDYYuSDHTn/9SiBhkIgTmw87H+5MAxe4hnvOPbrSxSN4s9p9TtE6/Ep8HI+A4e9GcD4CABiOw/5wy2NhMdm/wrJMI1/Y9oa76DcWRkNYN5GPs9qJlJCYRuu60SNJgj1as6E0Jdv6aQGJ69+6RkusGe48FOvWfyKfhsJqOtP9gT9wtAzFV7tgUhxYIXmyeJIsRD7ZVkzwdgRvUkJosbXa3eCYR0/Gpbju0xNHyxtLcUJfcsGvv//c7JzvP0D833cw+UR3XzfRlfDygOKNeGDmICCE/R8lPt5xd8Xd00cZkpRs2CHu1RjB2TaMKL5bI48f3TIWMFtIVzHbM9nmA05pxMJ5HUfLhNko4Scgseexe8cpM8RogeMZ8u6XufcbEhNmnickYW5nQRTH0jQC0Lex61vMntGU3OPKERcsIKMcnOnBkU/K7gfvKiYRrM+YrDBNH9kpkrzQLi4R1J0IHLcVGtjCFlYYYE6FEQnqLUvXe3TYhgDoCLCABhZKaRAhj/6LhgJKgSIIAuB5TVD4zsKxu4JCxl6qiAE0LNwGKJy+kIBNSEQBR8J7X0g47nmRkONEXcEWKG2nYKdhcB5hgqZ5bmUy9XGkTIjFkAicJwugpSMwbBY0Dg2tBOk1xd9qyIBh42/p8U/yCrQAgV9fiJGCA3tgWld/LcIqEKvI9/nlsxRn0W+0yF1xFNckSmj+ZPZsZM+5rw0lWVFSmxomDbWVMHWBkF3XKKuheDIbEIK9IeRoCPkYrzOMUi+84lOK+4i2G7c7CJtt1InN4ja2tchNB6X2pGGAVcKEE/+az7U4bWOUZZFXj0w9jHgX0R+V7Z98mz1msTffVQ7NH8WOj7Iwd8Ddsfimjz+qOxUXfHfvI9+TTp4EKCOb1MMvYA9F6RLTQycW52G/NqvU8a6g2TRSS1uKY0Sjh/pctAli8Q3fuATs6QSVLISuwpLiwcVV1bmj4siCCi+BU3dUBEZzlDOufOzTSejoBXeKkZ+NgBNz/Vywcs9Z8i32TchHFOlHgpSsdKvOZQbdFz4b7U+WA5JQsdLBBu5ivzJSGvmnka+HM1SVmXJVRNzNqLrw0CQ/xtgwp2476slTSBBkuBcymG7ninSioLRWsiEUybgoSbKtuiSVE5WjJWmq1uCKo54lCTQtnSkszEK05pveY8zLvxQ+X0QsCmJ+WZSGclHsrw1lbspiTgiI3U3l4dSjOdULaqeBFOp40t1Slz6haZnj9QrihMQ9WST43jecRiwyfORoWYqAF+Y9uKxSBCiVLZyA0/K+nEpIR0Bx1Hfe66XIG8p7AJ0zJ74+k35D4YUOOHN49Wnwh65Wl/Sf11V4UbpqmXWGWdaJumqpVJ0OrKudrzP0xcwDvLTBnrbdUVOubr46bqrL5FBZNXgpN20Djp3p/uPW3brOGFqVw8PytvPZ6FtRVNmUeJ621kXR1lErzJNpazFiGvsPuCDaQr2MHY/HGnPfbH9iqvQnGvrYsIFivfUnoF73vmc8gCwHzoaH/bysH9UdOu4NJ4TdoLGN6nguXgTdBH0CJ7Wgmw1Bb+zSAXfSV9j1KhAlhIaM2e+5m6qMSMBteOPDGjQ99KqnfOGg/btoR+XK7Pbmej47mCvN7zJ0kUGWkkGWpQHT1K2d9I
WLnP31XI3K7aOq0XLnYrq1UuQvpPBUB0Hz1Lm8qbRrTUu5l77btXox45NtEpO8ZcvuzUGr/G2iVZGHalOWyXjKlCTLhZ4SncGvrkdb5mXrHu2VMTaAUZ8Bg3b0679la7/v2cZkUh8moAu1YWLQ6tbWE1TkHJP6SjpWwHF+bQgViXBVhPGanWAa612Rw+K4zGFGa4oT/ZWqi8T4qezOVUDcO1BU4eAYdQQ3ZEtAarVsjD3TIlDb3d1xQ28RtH2X9y2UdmyUP29pZ+tTo4+cvYycBbau58Pm7IHpWPsfpLzanIWGvqAxbM5OP3L2QnMWQl3PB81ZeT8HfjPgIS/ErwPZLhBSliOnL1yNPKFKZrv7H94Wk5z975fh7f8=</diagram></mxfile> |
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,80 @@ | ||
# Artifact Management | ||
|
||
The Deep Search Toolkit provides artifact management capabilities—both via its Python SDK | ||
and via the CLI. | ||
|
||
<img src=".resources/artifact_management.png" alt="artifact management design" /> | ||
|
||
- Artifact information is organized in an *artifact index*, which is a directory | ||
containing one subdirectory for each artifact (multiple artifact indexes are possible) | ||
- Each artifact subdir contains a JSON file with metadata for the artifact (for filename | ||
check [artifact_manager.py](artifact_manager.py)) | ||
- The name of each such subdir is used as the *artifact name* | ||
- The JSON file contains a key with the URL to the actual artifact data (for key name | ||
check [artifact_manager.py](artifact_manager.py)) | ||
- Users can download indexed artifacts to a local *artifact cache* for downstream | ||
processing | ||
|
||
## Usage | ||
|
||
### Usage with Python SDK | ||
```python | ||
from deepsearch.artifacts.artifact_manager import ArtifactManager | ||
|
||
artf_mgr = ArtifactManager( | ||
index="/foo/index", # set accordingly | ||
) | ||
|
||
artifacts_in_index = artf_mgr.get_artifacts_in_index() | ||
print(artifacts_in_index) | ||
# output -> ['artifact_c', 'artifact_a', 'artifact_b', 'artifact_d'] | ||
|
||
print(artf_mgr.get_artifacts_in_cache()) | ||
# output -> [] | ||
|
||
for artf_name in artifacts_in_index: | ||
artf_mgr.download_artifact_to_cache( | ||
artifact_name=artf_name, | ||
hit_strategy=ArtifactManager.HitStrategy.OVERWRITE, | ||
unpack_archives=True, | ||
with_progress_bar=False, | ||
) | ||
|
||
print(artf_mgr.get_artifacts_in_cache()) | ||
# output -> ['artifact_c', 'artifact_a', 'artifact_b', 'artifact_d'] | ||
|
||
print(artf_mgr.get_artifact_path_in_cache(artifact_name=artifacts_in_index[0])) | ||
# output -> /Users/pva/Library/Caches/deepsearch/artifact_cache/artifact_c | ||
|
||
print(artf_mgr.get_cache_path()) | ||
# output -> /Users/pva/Library/Caches/deepsearch/artifact_cache | ||
``` | ||
|
||
### Usage with CLI | ||
```console | ||
$ deepsearch artifacts --help | ||
Usage: deepsearch artifacts [OPTIONS] COMMAND [ARGS]... | ||
|
||
Manage artifacts | ||
|
||
Options: | ||
--help Show this message and exit. | ||
|
||
Commands: | ||
download Download an artifact to cache | ||
download-all Download all artifacts to cache | ||
list-cache List artifacts in cache | ||
list-index List artifacts in index | ||
locate-cached-artifact Show path of a cached artifact | ||
locate-default-cache Show cache path | ||
``` | ||
|
||
### Environment variables | ||
|
||
Environment variables can be used for overriding internal defaults—for the latest status, | ||
check [artifact_manager.py](artifact_manager.py). | ||
|
||
- `DEEPSEARCH_ARTIFACT_INDEX`: default index path | ||
- `DEEPSEARCH_ARTIFACT_CACHE`: default cache path | ||
- `DEEPSEARCH_ARTIFACT_META_FILENAME`: name of JSON metadata file | ||
- `DEEPSEARCH_ARTIFACT_URL_FIELD`: field for download URL within JSON metadata file |
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,181 @@ | ||
import json | ||
import os | ||
import shutil | ||
import tempfile | ||
from enum import Enum | ||
from pathlib import Path | ||
from typing import Dict, List | ||
from urllib.parse import urlparse | ||
|
||
import platformdirs | ||
import requests | ||
from tqdm import tqdm | ||
|
||
# Module-level defaults. Each value is read once at import time from an
# environment variable and falls back to a built-in default when unset.

# Index location: falls back to the working directory at import time.
DFLT_ARTFCT_INDEX_DIR = os.environ.get("DEEPSEARCH_ARTIFACT_INDEX", os.getcwd())

# Cache location: falls back to a platform-specific user cache directory.
DFLT_ARTFCT_CACHE_DIR = os.environ.get(
    "DEEPSEARCH_ARTIFACT_CACHE",
    Path(platformdirs.user_cache_dir("deepsearch", "ibm")) / "artifact_cache",
)

# Name of the per-artifact JSON metadata file inside the index.
ARTF_META_FILENAME = os.environ.get("DEEPSEARCH_ARTIFACT_META_FILENAME", "meta.info")

# Key inside the metadata file that holds the download URL.
ARTF_META_URL_FIELD = os.environ.get("DEEPSEARCH_ARTIFACT_URL_FIELD", "static_url")
|
||
|
||
class ArtifactManager:
    """Manage artifacts listed in a local index and stored in a local cache.

    The index is a directory with one subdirectory per artifact; each such
    subdirectory holds a JSON metadata file (``ARTF_META_FILENAME``) whose
    ``ARTF_META_URL_FIELD`` entry is the URL of the artifact data. The cache
    is a directory with one subdirectory per downloaded artifact.
    """

    class HitStrategy(str, Enum):
        """How to react when a requested artifact is already in the cache."""

        RAISE = "raise"
        PASS = "pass"
        OVERWRITE = "overwrite"

    def __init__(self, index=None, cache=None):
        """Set up index and cache paths and make sure the cache directory exists.

        Args:
            index: Path of the artifact index; module default when falsy.
            cache: Path of the artifact cache; module default when falsy.
        """
        self._index_path = Path(index or DFLT_ARTFCT_INDEX_DIR)
        self._cache_path = Path(cache or DFLT_ARTFCT_CACHE_DIR)
        self._cache_path.mkdir(parents=True, exist_ok=True)

    def get_cache_path(self) -> Path:
        """Return the cache directory path."""
        return self._cache_path

    def get_index_path(self) -> Path:
        """Return the index directory path."""
        return self._index_path

    def get_artifact_path_in_cache(self, artifact_name: str) -> Path:
        """Return the cache path of an artifact.

        Raises:
            FileNotFoundError: If the artifact is not present in the cache.
        """
        artifact_path = self._cache_path / artifact_name
        if not artifact_path.exists():
            raise FileNotFoundError(f'Artifact "{artifact_name}" not in cache')
        return artifact_path

    def download_artifact_to_cache(
        self,
        artifact_name: str,
        unpack_archives: bool = True,
        hit_strategy: HitStrategy = HitStrategy.OVERWRITE,
        with_progress_bar: bool = False,
    ) -> None:
        """Download an indexed artifact into the cache.

        Args:
            artifact_name: Name of the artifact subdirectory in the index.
            unpack_archives: Unpack the download when its extension matches
                an archive format known to ``shutil``.
            hit_strategy: What to do when the artifact is already cached.
            with_progress_bar: Show a progress bar while downloading.

        Raises:
            ValueError: Artifact already cached and strategy is ``RAISE``.
            RuntimeError: Artifact already cached and strategy is unknown.
            FileNotFoundError: The artifact has no metadata file in the index.
            KeyError: The metadata file lacks the download URL field.
        """
        artifact_path = self._cache_path / artifact_name
        if artifact_path.exists():
            if hit_strategy == self.HitStrategy.RAISE:
                raise ValueError(f'Artifact "{artifact_name}" already in cache')
            if hit_strategy == self.HitStrategy.PASS:
                return
            if hit_strategy != self.HitStrategy.OVERWRITE:
                raise RuntimeError(f'Unexcpected value "{hit_strategy=}"')

        # Resolve the URL before touching the cache, so a missing or broken
        # metadata file cannot leave an empty directory behind.
        download_url = self._get_artifact_meta(artifact_name)[ARTF_META_URL_FIELD]

        with tempfile.TemporaryDirectory() as temp_dir:
            download_path = self._download_file(
                artifact_name=artifact_name,
                download_url=download_url,
                download_root_path=Path(temp_dir),
                with_progress_bar=with_progress_bar,
            )

            # Only now that the download succeeded is the old copy replaced;
            # a failed download keeps the previously cached artifact intact.
            if artifact_path.exists():
                shutil.rmtree(artifact_path)
            artifact_path.mkdir(exist_ok=False)
            try:
                self._finalize_download(
                    download_path=download_path,
                    target_path=artifact_path,
                    unpack_archives=unpack_archives,
                )
            except Exception:
                # Do not leave a half-populated entry that would be listed
                # as a valid cached artifact.
                shutil.rmtree(artifact_path, ignore_errors=True)
                raise

    def get_artifacts_in_index(self) -> List[str]:
        """Return names of index entries that contain a metadata file."""
        with os.scandir(self._index_path) as entries:
            return [
                entry.name
                for entry in entries
                if (self._index_path / entry.name / ARTF_META_FILENAME).exists()
            ]

    def get_artifacts_in_cache(self) -> List[str]:
        """Return names of artifact directories present in the cache.

        Only directories count: every cached artifact is created as a
        directory, so stray files in the cache root are not artifacts.
        """
        with os.scandir(self._cache_path) as entries:
            return [entry.name for entry in entries if entry.is_dir()]

    @staticmethod
    def _resolve_download_filename(
        content_disposition, download_url: str, fallback: str
    ) -> str:
        """Pick a safe local file name for a download.

        Args:
            content_disposition: Value of the ``Content-Disposition`` response
                header, or ``None`` when the header is absent.
            download_url: URL the file is downloaded from.
            fallback: Name to use when neither source yields a usable name.

        Returns:
            A bare file name without any directory component.
        """
        candidate = ""
        if content_disposition:
            for param in content_disposition.strip().split(";"):
                key, sep, value = param.partition("=")
                # currently only handling directive "filename" (not "*filename")
                if sep and key.strip() == "filename":
                    candidate = value.strip().strip("'\"")
                    break

        # otherwise, use name from URL
        if not candidate:
            candidate = Path(urlparse(download_url).path).name

        # The name comes from the server: keep only its last path component
        # so it cannot point outside the download directory.
        candidate = candidate.replace("\\", "/").rsplit("/", 1)[-1]
        if candidate in ("", ".", ".."):
            return fallback
        return candidate

    def _download_file(
        self,
        artifact_name: str,
        download_url: str,
        download_root_path: Path,
        with_progress_bar: bool,
    ) -> Path:
        """Stream ``download_url`` into ``download_root_path``.

        Returns:
            Path of the downloaded file.

        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        block_size = 1024  # 1 KB

        # The context manager releases the streamed connection in all cases.
        with requests.get(download_url, stream=True) as response:
            response.raise_for_status()

            dl_filename = self._resolve_download_filename(
                content_disposition=response.headers.get("Content-Disposition"),
                download_url=download_url,
                fallback=artifact_name,
            )
            download_path = download_root_path / dl_filename

            progress_bar = None
            if with_progress_bar:
                total_size = int(response.headers.get("content-length", 0))
                progress_bar = tqdm(total=total_size, unit="B", unit_scale=True)
                progress_bar.set_description(
                    f'Downloading "{dl_filename}" ("{artifact_name}")'
                )

            try:
                with open(download_path, "wb") as file:
                    for data in response.iter_content(block_size):
                        file.write(data)
                        if progress_bar is not None:
                            progress_bar.update(len(data))
            finally:
                if progress_bar is not None:
                    progress_bar.close()

        return download_path

    def _finalize_download(
        self,
        download_path: Path,
        target_path: Path,
        unpack_archives: bool = True,
    ) -> None:
        """Place a downloaded file into ``target_path``.

        The file is unpacked into ``target_path`` when ``unpack_archives`` is
        set and its name ends with an extension registered in ``shutil``;
        otherwise it is moved into ``target_path`` unchanged.
        """
        dl_path_str = str(download_path.resolve())

        attempt_unpack = False
        if unpack_archives:
            # Each registered format is (name, extensions, description).
            unpack_extensions = tuple(
                ext
                for _name, extensions, _descr in shutil.get_unpack_formats()
                for ext in extensions
            )
            attempt_unpack = download_path.name.endswith(unpack_extensions)

        if attempt_unpack:
            shutil.unpack_archive(dl_path_str, str(target_path))
        else:
            shutil.move(dl_path_str, str(target_path))

    def _get_artifact_meta(self, artifact_name: str) -> Dict:
        """Load the JSON metadata of an indexed artifact.

        Raises:
            FileNotFoundError: If the metadata file does not exist.
        """
        file_path = self._index_path / artifact_name / ARTF_META_FILENAME
        if not file_path.exists():
            raise FileNotFoundError(f'File "{file_path}" does not exist')
        with open(file_path, "r", encoding="utf-8") as file:
            return json.load(file)
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,108 @@ | ||
import typer | ||
|
||
from deepsearch.artifacts.artifact_manager import ( | ||
DFLT_ARTFCT_CACHE_DIR, | ||
DFLT_ARTFCT_INDEX_DIR, | ||
ArtifactManager, | ||
) | ||
|
||
# Typer application that groups the artifact sub-commands.
app = typer.Typer(no_args_is_help=True, add_completion=False)

# Shared option objects, reused by several commands below.

# Index location; None means "use the ArtifactManager default".
INDEX_OPTION = typer.Option(
    None,
    "--index",
    "-i",
    help=(
        "Artifact index path (default set via env var "
        "DEEPSEARCH_ARTIFACT_INDEX, else current working dir)."
    ),
)

# Cache location; None means "use the ArtifactManager default".
CACHE_OPTION = typer.Option(
    None,
    "--cache",
    "-c",
    help=(
        "Artifact cache path (default set via env var "
        "DEEPSEARCH_ARTIFACT_CACHE, else platform-specific)."
    ),
)

# Behavior when the artifact already exists in the cache.
HIT_STRATEGY_OPTION = typer.Option(
    ArtifactManager.HitStrategy.OVERWRITE,
    "--hit-strategy",
    "-s",
    help="How to handle case of artifact being already in cache.",
)
|
||
|
||
@app.command(help="List artifacts in index")
def list_index(
    index: str = INDEX_OPTION,
):
    # Echo every artifact name found in the index, one per line.
    manager = ArtifactManager(index=index)
    for name in manager.get_artifacts_in_index():
        typer.echo(name)
|
||
|
||
@app.command(help="List artifacts in cache")
def list_cache(
    cache: str = CACHE_OPTION,
):
    # Echo every artifact name found in the cache, one per line.
    manager = ArtifactManager(cache=cache)
    for name in manager.get_artifacts_in_cache():
        typer.echo(name)
|
||
|
||
@app.command(help="Show cache path")
def locate_default_cache():
    # Echo the resolved cache path of a manager built with default settings.
    cache_path = ArtifactManager().get_cache_path()
    typer.echo(str(cache_path.resolve()))
|
||
|
||
@app.command(help="Show path of a cached artifact")
def locate_cached_artifact(
    artifact_name: str,
    cache: str = CACHE_OPTION,
):
    # Echo the resolved path of the named artifact inside the cache.
    manager = ArtifactManager(cache=cache)
    cached_path = manager.get_artifact_path_in_cache(artifact_name=artifact_name)
    typer.echo(str(cached_path.resolve()))
|
||
|
||
@app.command(help="Download an artifact to cache")
def download(
    artifact_name: str,
    index: str = INDEX_OPTION,
    cache: str = CACHE_OPTION,
    hit_strategy: ArtifactManager.HitStrategy = HIT_STRATEGY_OPTION,
    unpack: bool = typer.Option(True),
    progress_bar: bool = typer.Option(True),
):
    # Forward the CLI options to the manager for a single artifact.
    manager = ArtifactManager(index=index, cache=cache)
    download_kwargs = dict(
        artifact_name=artifact_name,
        unpack_archives=unpack,
        hit_strategy=hit_strategy,
        with_progress_bar=progress_bar,
    )
    manager.download_artifact_to_cache(**download_kwargs)
|
||
|
||
@app.command(help="Download all artifacts to cache")
def download_all(
    index: str = INDEX_OPTION,
    cache: str = CACHE_OPTION,
    hit_strategy: ArtifactManager.HitStrategy = HIT_STRATEGY_OPTION,
    unpack: bool = typer.Option(True),
    progress_bar: bool = typer.Option(True),
):
    # Download every artifact listed in the index, using the same settings
    # for each one.
    manager = ArtifactManager(index=index, cache=cache)
    shared_kwargs = dict(
        unpack_archives=unpack,
        hit_strategy=hit_strategy,
        with_progress_bar=progress_bar,
    )
    for name in manager.get_artifacts_in_index():
        manager.download_artifact_to_cache(artifact_name=name, **shared_kwargs)
|
||
|
||
# Run the Typer app when this module is executed directly as a script.
if __name__ == "__main__":
    app()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.