Skip to content

Commit

Permalink
Merge pull request #1595 from pyiron/restructure_archive
Browse files Browse the repository at this point in the history
[patch] Restructure archive
  • Loading branch information
samwaseda authored Aug 14, 2024
2 parents 120614f + 6ff170d commit 6658f77
Show file tree
Hide file tree
Showing 5 changed files with 20 additions and 63 deletions.
52 changes: 7 additions & 45 deletions pyiron_base/project/archiving/export_archive.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,37 +6,6 @@
from pyiron_base.project.archiving.shared import getdir


def update_project(project_instance, directory_to_transfer, archive_directory, df):
    """
    Rewrite the project paths in a job table so they point into the archive.

    Each path in ``df["project"]`` is first made relative to the currently
    opened project, then re-rooted under
    ``<archive_directory>/<directory_to_transfer>``.

    Args:
        project_instance (Project): Project instance used to resolve relative paths.
        directory_to_transfer (str): Directory containing the jobs being transferred.
        archive_directory (str): Base directory of the archive.
        df (DataFrame): Job table with a "project" column of project paths.

    Returns:
        list: Updated project paths rooted under the archive location.
    """
    transfer_name = getdir(path=directory_to_transfer)
    archive_name = getdir(path=archive_directory)
    # Common archive root shared by every rewritten path.
    archive_root = os.path.join(archive_name, transfer_name)

    opened_project = project_instance.open(os.curdir)
    updated_paths = []
    for project_path in df["project"].values:
        rel_path = os.path.relpath(project_path, opened_project.project_path)
        # "." means the job lives directly in the transferred directory;
        # joining it would append a spurious "/." segment.
        if rel_path == ".":
            updated_paths.append(archive_root)
        else:
            updated_paths.append(os.path.join(archive_root, rel_path))
    return updated_paths


def copy_files_to_archive(
directory_to_transfer,
archive_directory,
Expand All @@ -52,6 +21,7 @@ def copy_files_to_archive(
archive_directory (str): The destination directory for the archive.
compress (bool): If True, compress the archive directory into a tarball. Default is True.
copy_all_files (bool): If True, include all files in the archive, otherwise only .h5 files. Default is False.
arcname (str): The name of the archive directory. Default is the name of the directory to transfer.
"""

Expand All @@ -75,20 +45,16 @@ def copy_files(origin, destination, copy_all_files=copy_all_files):
arcname = os.path.relpath(os.path.abspath(archive_directory), os.getcwd())
dir_name_transfer = getdir(path=directory_to_transfer)
if not compress:
copy_files(
directory_to_transfer, os.path.join(archive_directory, dir_name_transfer)
)
copy_files(directory_to_transfer, os.path.join(archive_directory, arcname))
else:
with tempfile.TemporaryDirectory() as temp_dir:
# Copy files to the temporary directory
copy_files(directory_to_transfer, os.path.join(temp_dir, dir_name_transfer))
dest = os.path.join(temp_dir, dir_name_transfer)
copy_files(directory_to_transfer, dest)

# Compress the temporary directory into a tar.gz archive
with tarfile.open(f"{archive_directory}.tar.gz", "w:gz") as tar:
tar.add(
temp_dir,
arcname=arcname,
)
tar.add(dest, arcname=arcname)


def copy_h5_files(src, dst):
Expand All @@ -115,20 +81,16 @@ def copy_h5_files(src, dst):
shutil.copy2(src_file, os.path.join(dst_dir, file))


def export_database(pr, directory_to_transfer, archive_directory):
def export_database(pr):
"""
Export the project database to an archive directory.
Args:
pr (Project): The project instance containing the jobs.
directory_to_transfer (str): The directory containing the jobs to transfer.
archive_directory (str): The destination directory for the archive.
Returns:
DataFrame: DataFrame containing updated job information with new IDs and project paths.
"""
assert isinstance(archive_directory, str) and ".tar.gz" not in archive_directory
directory_to_transfer = os.path.basename(directory_to_transfer)

df = pr.job_table()
job_translate_dict = {
Expand All @@ -138,7 +100,7 @@ def export_database(pr, directory_to_transfer, archive_directory):
df["id"] = df["id"].map(job_translate_dict)
df["masterid"] = df["masterid"].map(job_translate_dict)
df["parentid"] = df["parentid"].map(job_translate_dict)
df["project"] = update_project(pr, directory_to_transfer, archive_directory, df)
df["project"] = df["project"].map(lambda x: os.path.relpath(x, os.getcwd()))

df.drop(columns=["projectpath"], inplace=True)
return df
12 changes: 7 additions & 5 deletions pyiron_base/project/archiving/import_archive.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,22 +50,24 @@ def import_jobs(project_instance, archive_directory, df, compressed=True):
does not have the correct format paths
as string or pyiron Project objects are expected"""
)
common_path = os.path.commonpath(list(df["project"]))
os.makedirs(archive_directory, exist_ok=True)
if compressed:
with tarfile.open(archive_directory + ".tar.gz", "r:gz") as tar:
tar.extractall()
tar.extractall(path=archive_directory)

# source folder; archive folder
src = os.path.abspath(archive_directory)
src = os.path.abspath(os.path.join(archive_directory, common_path))
copytree(src, project_instance.path, dirs_exist_ok=True)
if compressed:
rmtree(src)
rmtree(os.path.abspath(archive_directory))

# # Update Database
pr_import = project_instance.open(os.curdir)

df["project"] = [
os.path.join(
pr_import.project_path, os.path.relpath(p, getdir(path=archive_directory))
os.path.normpath(
os.path.join(pr_import.project_path, os.path.relpath(p, common_path))
)
+ "/"
for p in df["project"].values
Expand Down
10 changes: 3 additions & 7 deletions pyiron_base/project/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -1957,8 +1957,6 @@ def pack(
"""
if destination_path is None:
destination_path = self.path
if os.path.isabs(destination_path):
destination_path = os.path.relpath(destination_path, os.getcwd())
if ".tar.gz" in destination_path:
destination_path = destination_path.split(".tar.gz")[0]
compress = True
Expand All @@ -1971,18 +1969,16 @@ def pack(
)
if destination_path_abs == directory_to_transfer and not compress:
raise ValueError(
"The destination_path cannot have the same name as the project to compress."
"The destination_path cannot have the same name as the project."
)
export_archive.copy_files_to_archive(
directory_to_transfer,
destination_path_abs,
compress=compress,
copy_all_files=copy_all_files,
arcname=destination_path,
)
df = export_archive.export_database(
self, directory_to_transfer, destination_path_abs
arcname=os.path.relpath(self.path, os.getcwd()),
)
df = export_archive.export_database(self)
df.to_csv(csv_file_path)

def unpack(self, origin_path, csv_file_name="export.csv", compress=True):
Expand Down
4 changes: 1 addition & 3 deletions tests/unit/archiving/test_export.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,9 +38,7 @@ def test_exportedCSV(self):
# in the first test, the csv file from the packing function is read
# and is compared with the return dataframe from export_database
directory_to_transfer = os.path.basename(self.pr.path[:-1])
df_exp = export_database(
self.pr, directory_to_transfer, "archive_folder"
).dropna(axis=1)
df_exp = export_database(self.pr).dropna(axis=1)
df_exp["hamversion"] = float(df_exp["hamversion"])
self.assertEqual(df_exp["job"].unique()[0], "toy")
self.assertEqual(df_exp["id"].unique()[0], 0)
Expand Down
5 changes: 2 additions & 3 deletions tests/unit/archiving/test_import.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,9 +173,8 @@ def test_import_with_targz_extension(self):
pr_imp.unpack(
origin_path=pack_path_comp, csv_file_name=pack_path_csv, compress=True
)
# here the 7 is the length of '.tar.gz' string
with tarfile.open(pack_path_comp[:-7] + ".tar.gz", "r:gz") as tar:
tar.extractall()
with tarfile.open(pack_path_comp, "r:gz") as tar:
tar.extractall(path=pack_path_comp[: -len(".tar.gz")])
compare_obj = dircmp(pack_path_comp[:-7], pr_imp.path)
self.assertEqual(len(compare_obj.diff_files), 0)
pr.remove(enable=True)
Expand Down

0 comments on commit 6658f77

Please sign in to comment.