Skip to content

Commit

Permalink
🐛 Force downloading locally artifacts from a run_id af ignore to spec…
Browse files Browse the repository at this point in the history
…ified filepath (#362)
  • Loading branch information
Galileo-Galilei committed Oct 5, 2022
1 parent 99065d4 commit bb409f1
Show file tree
Hide file tree
Showing 3 changed files with 28 additions and 5 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@

## [Unreleased]

### Fixed

- :bug: `MlflowArtifactDataSet.load()` now correctly loads the artifact when both `artifact_path` and `run_id` arguments are specified. Previous fix in ``0.11.4`` did not work because when the file already exist locally, mlflow did not download it again so tests were incorrectly passing. ([#362](https://github.com/Galileo-Galilei/kedro-mlflow/issues/362))

## [0.11.4] - 2022-10-04

### Fixed
Expand Down
22 changes: 18 additions & 4 deletions kedro_mlflow/io/artifacts/mlflow_artifact_dataset.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import shutil
from pathlib import Path
from typing import Any, Dict, Union

Expand Down Expand Up @@ -95,18 +96,31 @@ def _load(self) -> Any: # pragma: no cover
# special datasets with a folder instead of a specifi files like PartitionedDataSet
local_path = Path(self._path)

# BEWARE: we must enforce Path(local_path) because it is a PurePosixPath which fails on windows
# this is very weird: if you assign the value, it is converted to a Pureposixpath again, e.g:
# this fails:
# local_path = Path(local_path)
# local_path.name # local_path has been converted back to PurePosixPath on windows on this 2nd row
# but this works as a one liner::
# filename = Path(local_path).name

filename = Path(local_path).name
artifact_path = (
(self.artifact_path / Path(local_path.name)).as_posix()
(self.artifact_path / Path(filename)).as_posix()
if self.artifact_path
else local_path.name
else filename
)

mlflow_client.download_artifacts(
# we cannot use dst_path, because it downlaods teh file to "local_path / artifact_path /filename.pkl"
# the artifact_path suffix prevents from loading when we call super._load()
temp_download_filepath = mlflow_client.download_artifacts(
run_id=self.run_id,
path=artifact_path,
dst_path=local_path.parent.as_posix(), # must be a **local** **directory**
# dst_path=local_path.parent.as_posix(),
)

shutil.copy(src=temp_download_filepath, dst=local_path)

# finally, read locally
return super()._load()

Expand Down
7 changes: 6 additions & 1 deletion tests/io/artifacts/test_mlflow_artifact_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -243,7 +243,9 @@ def test_artifact_dataset_load_with_run_id_and_artifact_path(
with mlflow.start_run():
mlflow_csv_dataset1.save(df1)
first_run_id = mlflow.active_run().info.run_id

(
tmp_path / "df1.csv"
).unlink() # we need to delete the data, else it is automatically reused instead of downloading
# same as before, but a closed run_id is specified
mlflow_csv_dataset2 = MlflowArtifactDataSet(
data_set=dict(type=CSVDataSet, filepath=(tmp_path / "df1.csv").as_posix()),
Expand All @@ -252,6 +254,9 @@ def test_artifact_dataset_load_with_run_id_and_artifact_path(
)

assert df1.equals(mlflow_csv_dataset2.load())
assert df1.equals(
mlflow_csv_dataset2.load()
) # we check idempotency. When using a local tracking uri, mlflow does not downlaod in a tempfolder, so moving the folder alter the mlflow run


@pytest.mark.parametrize("artifact_path", [None, "partitioned_data"])
Expand Down

0 comments on commit bb409f1

Please sign in to comment.