From 90b975cd6ca6d117afef3569b33cf6ba0c9317c2 Mon Sep 17 00:00:00 2001 From: mwigham <38520885+mwigham@users.noreply.github.com> Date: Mon, 6 May 2024 14:06:24 +0200 Subject: [PATCH] provenance improvements --- Dockerfile | 1 + io_util.py | 12 ++++++++---- main_data_processor.py | 6 +++++- pyproject.toml | 2 +- 4 files changed, 15 insertions(+), 6 deletions(-) diff --git a/Dockerfile b/Dockerfile index 90226a1..01f960f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -29,6 +29,7 @@ RUN poetry install --no-ansi --without dev --no-root && rm -rf $POETRY_CACHE_DIR # Write provenance info about software versions to file RUN echo "dane-video-segmentation-worker;https://github.com/beeldengeluid/dane-video-segmentation-worker/commit/$(git rev-parse HEAD)" >> /software_provenance.txt +RUN echo "scenedetect;$(poetry show scenedetect | grep ' version .*' | cut --delimiter=: --fields=2 | cut --delimiter=' ' --fields=2)" >> /software_provenance.txt COPY . /src diff --git a/io_util.py b/io_util.py index 70d13fa..cde0c48 100644 --- a/io_util.py +++ b/io_util.py @@ -240,7 +240,7 @@ def http_download(url: str) -> Optional[DownloadResult]: response = requests.get(url) file.write(response.content) file.close() - download_time = time.time() - start_time + download_time = (time.time() - start_time) * 1000 # time in ms return DownloadResult( output_file, # NOTE or output_file? hmmm download_time, # TODO add mime_type and content_length @@ -279,13 +279,17 @@ def s3_download(s3_uri: str) -> Optional[DownloadResult]: def to_download_provenance( - download_result: DownloadResult, input_file_path: str, start_time: float + download_result: DownloadResult, + input_file_path: str, + start_time: float, + software_version: str, ) -> Provenance: return Provenance( activity_name="Download VisXP input", activity_description="Download source AV media", - start_time_unix=start_time, # TODO not supplied yet by download worker - processing_time_ms=download_result.download_time, # TODO not supllied yet by download worker + start_time_unix=start_time, + processing_time_ms=download_result.download_time, + software_version=software_version, input_data={"input_file_path": input_file_path}, output_data={"file_path": download_result.file_path}, ) diff --git a/main_data_processor.py b/main_data_processor.py index abfba1d..cc10a9e 100644 --- a/main_data_processor.py +++ b/main_data_processor.py @@ -57,6 +57,7 @@ def run( "optionally extract keyframes and/or corresponding audio spectrograms" ), input_data={"input_file_path": input_file_path}, # TODO S3 URI! + start_time=time(), parameters=dict(cfg.VISXP_PREP), software_version=obtain_software_versions(DANE_WORKER_ID), ) @@ -76,7 +77,10 @@ def run( }, [] else: download_provenance = to_download_provenance( - download_result, input_file_path, start_time=start_time + download_result, + input_file_path, + start_time=start_time, + software_version=top_level_provenance.software_version, ) input_file_path = download_result.file_path if download_result else "" diff --git a/pyproject.toml b/pyproject.toml index 963debf..53ff302 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,7 +11,7 @@ yacs = "^0.1.8" pika = "^1.3.2" requests = "^2.31.0" validators = "^0.22.0" -dane = "^0.4.2" +dane = { git = "https://github.com/CLARIAH/DANE" , branch = "ms_fix"} scenedetect = { git = "https://github.com/Veldhoen/PySceneDetect" , tag = "v0.6.4-alpha", extras = ["opencv"]}