Skip to content

Commit

Permalink
provenance improvements
Browse files: browse the repository at this point in the history
  • Loading branch information
mwigham committed May 6, 2024
1 parent 473fb57 commit 90b975c
Show file tree
Hide file tree
Showing 4 changed files with 15 additions and 6 deletions.
1 change: 1 addition & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ RUN poetry install --no-ansi --without dev --no-root && rm -rf $POETRY_CACHE_DIR

# Write provenance info about software versions to file
RUN echo "dane-video-segmentation-worker;https://github.com/beeldengeluid/dane-video-segmentation-worker/commit/$(git rev-parse HEAD)" >> /software_provenance.txt
RUN echo "scenedetect;$(poetry show scenedetect | grep ' version .*' | cut --delimiter=: --fields=2 | cut --delimiter=' ' --fields=2)" >> /software_provenance.txt

COPY . /src

Expand Down
12 changes: 8 additions & 4 deletions io_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -240,7 +240,7 @@ def http_download(url: str) -> Optional[DownloadResult]:
response = requests.get(url)
file.write(response.content)
file.close()
download_time = time.time() - start_time
download_time = (time.time() - start_time) * 1000 # time in ms
return DownloadResult(
output_file, # NOTE or output_file? hmmm
download_time, # TODO add mime_type and content_length
Expand Down Expand Up @@ -279,13 +279,17 @@ def s3_download(s3_uri: str) -> Optional[DownloadResult]:


def to_download_provenance(
download_result: DownloadResult, input_file_path: str, start_time: float
download_result: DownloadResult,
input_file_path: str,
start_time: float,
software_version: str,
) -> Provenance:
return Provenance(
activity_name="Download VisXP input",
activity_description="Download source AV media",
start_time_unix=start_time, # TODO not supplied yet by download worker
processing_time_ms=download_result.download_time, # TODO not supplied yet by download worker
start_time_unix=start_time,
processing_time_ms=download_result.download_time,
software_version=software_version,
input_data={"input_file_path": input_file_path},
output_data={"file_path": download_result.file_path},
)
6 changes: 5 additions & 1 deletion main_data_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ def run(
"optionally extract keyframes and/or corresponding audio spectrograms"
),
input_data={"input_file_path": input_file_path}, # TODO S3 URI!
start_time=time(),
parameters=dict(cfg.VISXP_PREP),
software_version=obtain_software_versions(DANE_WORKER_ID),
)
Expand All @@ -76,7 +77,10 @@ def run(
}, []
else:
download_provenance = to_download_provenance(
download_result, input_file_path, start_time=start_time
download_result,
input_file_path,
start_time=start_time,
software_version=top_level_provenance.software_version,
)
input_file_path = download_result.file_path if download_result else ""

Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ yacs = "^0.1.8"
pika = "^1.3.2"
requests = "^2.31.0"
validators = "^0.22.0"
dane = "^0.4.2"
dane = { git = "https://github.com/CLARIAH/DANE" , branch = "ms_fix"}
scenedetect = { git = "https://github.com/Veldhoen/PySceneDetect" , tag = "v0.6.4-alpha", extras = ["opencv"]}


Expand Down

0 comments on commit 90b975c

Please sign in to comment.