Skip to content

Commit

Permalink
Merge pull request #1595 from pyiron/restructure_archive
Browse files Browse the repository at this point in the history
[patch] Restructure archive
  • Loading branch information
samwaseda authored Aug 14, 2024
2 parents 120614f + 6ff170d commit 6658f77
Show file tree
Hide file tree
Showing 5 changed files with 20 additions and 63 deletions.
52 changes: 7 additions & 45 deletions pyiron_base/project/archiving/export_archive.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,37 +6,6 @@
from pyiron_base.project.archiving.shared import getdir


def update_project(project_instance, directory_to_transfer, archive_directory, df):
    """
    Rewrite the project paths in a job table so they point into the archive.

    Each path in ``df["project"]`` is first made relative to the currently
    opened project, then re-rooted under
    ``<archive_directory>/<directory_to_transfer>``.

    Args:
        project_instance (Project): Project instance used to resolve relative paths.
        directory_to_transfer (str): Directory containing the jobs being transferred.
        archive_directory (str): Base directory of the archive.
        df (DataFrame): Job table with a "project" column of project paths.

    Returns:
        list: Updated project paths rooted under the archive location.
    """
    transfer_name = getdir(path=directory_to_transfer)
    archive_name = getdir(path=archive_directory)
    # Common archive root shared by every rewritten path.
    archive_root = os.path.join(archive_name, transfer_name)

    opened_project = project_instance.open(os.curdir)
    updated_paths = []
    for project_path in df["project"].values:
        rel_path = os.path.relpath(project_path, opened_project.project_path)
        # "." means the job lives directly in the transferred directory;
        # joining it would append a spurious "/." segment.
        if rel_path == ".":
            updated_paths.append(archive_root)
        else:
            updated_paths.append(os.path.join(archive_root, rel_path))
    return updated_paths


def copy_files_to_archive(
directory_to_transfer,
archive_directory,
Expand All @@ -52,6 +21,7 @@ def copy_files_to_archive(
archive_directory (str): The destination directory for the archive.
compress (bool): If True, compress the archive directory into a tarball. Default is True.
copy_all_files (bool): If True, include all files in the archive, otherwise only .h5 files. Default is False.
arcname (str): The name of the archive directory. Default is the name of the directory to transfer.
"""

Expand All @@ -75,20 +45,16 @@ def copy_files(origin, destination, copy_all_files=copy_all_files):
arcname = os.path.relpath(os.path.abspath(archive_directory), os.getcwd())
dir_name_transfer = getdir(path=directory_to_transfer)
if not compress:
copy_files(
directory_to_transfer, os.path.join(archive_directory, dir_name_transfer)
)
copy_files(directory_to_transfer, os.path.join(archive_directory, arcname))
else:
with tempfile.TemporaryDirectory() as temp_dir:
# Copy files to the temporary directory
copy_files(directory_to_transfer, os.path.join(temp_dir, dir_name_transfer))
dest = os.path.join(temp_dir, dir_name_transfer)
copy_files(directory_to_transfer, dest)

# Compress the temporary directory into a tar.gz archive
with tarfile.open(f"{archive_directory}.tar.gz", "w:gz") as tar:
tar.add(
temp_dir,
arcname=arcname,
)
tar.add(dest, arcname=arcname)


def copy_h5_files(src, dst):
Expand All @@ -115,20 +81,16 @@ def copy_h5_files(src, dst):
shutil.copy2(src_file, os.path.join(dst_dir, file))


def export_database(pr, directory_to_transfer, archive_directory):
def export_database(pr):
"""
Export the project database to an archive directory.
Args:
pr (Project): The project instance containing the jobs.
directory_to_transfer (str): The directory containing the jobs to transfer.
archive_directory (str): The destination directory for the archive.
Returns:
DataFrame: DataFrame containing updated job information with new IDs and project paths.
"""
assert isinstance(archive_directory, str) and ".tar.gz" not in archive_directory
directory_to_transfer = os.path.basename(directory_to_transfer)

df = pr.job_table()
job_translate_dict = {
Expand All @@ -138,7 +100,7 @@ def export_database(pr, directory_to_transfer, archive_directory):
df["id"] = df["id"].map(job_translate_dict)
df["masterid"] = df["masterid"].map(job_translate_dict)
df["parentid"] = df["parentid"].map(job_translate_dict)
df["project"] = update_project(pr, directory_to_transfer, archive_directory, df)
df["project"] = df["project"].map(lambda x: os.path.relpath(x, os.getcwd()))

df.drop(columns=["projectpath"], inplace=True)
return df
12 changes: 7 additions & 5 deletions pyiron_base/project/archiving/import_archive.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,22 +50,24 @@ def import_jobs(project_instance, archive_directory, df, compressed=True):
does not have the correct format paths
as string or pyiron Project objects are expected"""
)
common_path = os.path.commonpath(list(df["project"]))
os.makedirs(archive_directory, exist_ok=True)
if compressed:
with tarfile.open(archive_directory + ".tar.gz", "r:gz") as tar:
tar.extractall()
tar.extractall(path=archive_directory)

# source folder; archive folder
src = os.path.abspath(archive_directory)
src = os.path.abspath(os.path.join(archive_directory, common_path))
copytree(src, project_instance.path, dirs_exist_ok=True)
if compressed:
rmtree(src)
rmtree(os.path.abspath(archive_directory))

# # Update Database
pr_import = project_instance.open(os.curdir)

df["project"] = [
os.path.join(
pr_import.project_path, os.path.relpath(p, getdir(path=archive_directory))
os.path.normpath(
os.path.join(pr_import.project_path, os.path.relpath(p, common_path))
)
+ "/"
for p in df["project"].values
Expand Down
10 changes: 3 additions & 7 deletions pyiron_base/project/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -1957,8 +1957,6 @@ def pack(
"""
if destination_path is None:
destination_path = self.path
if os.path.isabs(destination_path):
destination_path = os.path.relpath(destination_path, os.getcwd())
if ".tar.gz" in destination_path:
destination_path = destination_path.split(".tar.gz")[0]
compress = True
Expand All @@ -1971,18 +1969,16 @@ def pack(
)
if destination_path_abs == directory_to_transfer and not compress:
raise ValueError(
"The destination_path cannot have the same name as the project to compress."
"The destination_path cannot have the same name as the project."
)
export_archive.copy_files_to_archive(
directory_to_transfer,
destination_path_abs,
compress=compress,
copy_all_files=copy_all_files,
arcname=destination_path,
)
df = export_archive.export_database(
self, directory_to_transfer, destination_path_abs
arcname=os.path.relpath(self.path, os.getcwd()),
)
df = export_archive.export_database(self)
df.to_csv(csv_file_path)

def unpack(self, origin_path, csv_file_name="export.csv", compress=True):
Expand Down
4 changes: 1 addition & 3 deletions tests/unit/archiving/test_export.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,9 +38,7 @@ def test_exportedCSV(self):
# in the first test, the csv file from the packing function is read
# and is compared with the return dataframe from export_database
directory_to_transfer = os.path.basename(self.pr.path[:-1])
df_exp = export_database(
self.pr, directory_to_transfer, "archive_folder"
).dropna(axis=1)
df_exp = export_database(self.pr).dropna(axis=1)
df_exp["hamversion"] = float(df_exp["hamversion"])
self.assertEqual(df_exp["job"].unique()[0], "toy")
self.assertEqual(df_exp["id"].unique()[0], 0)
Expand Down
5 changes: 2 additions & 3 deletions tests/unit/archiving/test_import.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,9 +173,8 @@ def test_import_with_targz_extension(self):
pr_imp.unpack(
origin_path=pack_path_comp, csv_file_name=pack_path_csv, compress=True
)
# here the 7 is the length of '.tar.gz' string
with tarfile.open(pack_path_comp[:-7] + ".tar.gz", "r:gz") as tar:
tar.extractall()
with tarfile.open(pack_path_comp, "r:gz") as tar:
tar.extractall(path=pack_path_comp[: -len(".tar.gz")])
compare_obj = dircmp(pack_path_comp[:-7], pr_imp.path)
self.assertEqual(len(compare_obj.diff_files), 0)
pr.remove(enable=True)
Expand Down

0 comments on commit 6658f77

Please sign in to comment.