From 5fc7039dfe34e646a16906208cd27b296d0ed332 Mon Sep 17 00:00:00 2001
From: VGPReys <v.g.p.reys@uu.nl>
Date: Wed, 30 Oct 2024 15:12:44 +0100
Subject: [PATCH 1/8] removing mpi.pkl files at cleaning steps

---
 src/haddock/gear/clean_steps.py | 29 ++++++++++++++++++++---------
 1 file changed, 20 insertions(+), 9 deletions(-)

diff --git a/src/haddock/gear/clean_steps.py b/src/haddock/gear/clean_steps.py
index a73915d6f..5448c8abf 100644
--- a/src/haddock/gear/clean_steps.py
+++ b/src/haddock/gear/clean_steps.py
@@ -59,16 +59,25 @@ def clean_output(path: FilePath, ncores: int = 1) -> None:
     # `unpack_compressed_and_archived_files` so that the
     # uncompressing routines when restarting the run work.
 
+    # Files to delete (all)
+    file_to_delete_all = (
+        "mpi.pkl",
+        )
+    for extension in file_to_delete_all:
+        for file_ in glob_folder(path, extension):
+            Path(file_).unlink()
+
     # Files to delete
     # deletes all except the first one
-    files_to_delete = [
+    # (keeping one for debugging purposes)
+    files_to_delete = (
         ".inp",
         ".inp.gz",
         ".out",
         ".out.gz",
         ".job",
         ".err",
-        ]
+        )
 
     for extension in files_to_delete:
         flist = glob_folder(path, extension)
@@ -76,11 +85,11 @@ def clean_output(path: FilePath, ncores: int = 1) -> None:
             Path(file_).unlink()
 
     # files to archive (all files in single .gz)
-    files_to_archive = [
+    files_to_archive = (
         ".seed",
         ".seed.gz",
         ".con",
-        ]
+        )
 
     archive_ready = partial(_archive_and_remove_files, path=path)
     _ncores = min(ncores, len(files_to_archive))
@@ -90,13 +99,13 @@ def clean_output(path: FilePath, ncores: int = 1) -> None:
             pass
 
     # files to compress in .gz
-    files_to_compress = [
+    files_to_compress = (
         ".inp",
         ".out",
         ".pdb",
         ".psf",
         ".cnserr",
-        ]
+        )
 
     for ftc in files_to_compress:
         found = compress_files_ext(path, ftc, ncores=ncores)
@@ -111,9 +120,11 @@ def _archive_and_remove_files(fta: str, path: FilePath) -> None:
 
 
 # eventually this function can be moved to `libs.libio` in case of future need.
-def unpack_compressed_and_archived_files(folders: Iterable[FilePathT],
-                                         ncores: int = 1,
-                                         dec_all: bool = False) -> None:
+def unpack_compressed_and_archived_files(
+        folders: Iterable[FilePathT],
+        ncores: int = 1,
+        dec_all: bool = False,
+        ) -> None:
     """
     Unpack compressed and archived files in a folders.
 

From 2c06f17e3ff33a70bbdea3e7a6f5788805370919 Mon Sep 17 00:00:00 2001
From: VGPReys <v.g.p.reys@uu.nl>
Date: Wed, 30 Oct 2024 15:19:43 +0100
Subject: [PATCH 2/8] no more bioexcel survey

---
 src/haddock/gear/greetings.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/haddock/gear/greetings.py b/src/haddock/gear/greetings.py
index 3e92691c7..353456e96 100644
--- a/src/haddock/gear/greetings.py
+++ b/src/haddock/gear/greetings.py
@@ -31,7 +31,7 @@
 feedback_urls = {
     "GitHub issues": "https://github.com/haddocking/haddock3/issues",
     "BioExcel feedback": "https://www.bonvinlab.org/feedback",
-    "BioExcel survey": "https://bioexcel.eu/bioexcel-survey-2024/",
+    # "BioExcel survey": "https://bioexcel.eu/bioexcel-survey-2024/",
 }
 
 

From c1a429b7b9bcd018a2304411a6770f8cc7af746f Mon Sep 17 00:00:00 2001
From: VGPReys <v.g.p.reys@uu.nl>
Date: Thu, 31 Oct 2024 16:00:27 +0100
Subject: [PATCH 3/8] local structures in analysis & archiving haddock3 run

---
 src/haddock/clis/cli.py            |  20 +-
 src/haddock/clis/cli_analyse.py    | 367 +++++++++++++++++++----------
 src/haddock/core/optional.yaml     |  10 +-
 src/haddock/gear/postprocessing.py |  46 ++++
 src/haddock/libs/libplots.py       |  43 +++-
 src/haddock/libs/libworkflow.py    |  19 +-
 6 files changed, 357 insertions(+), 148 deletions(-)
 create mode 100644 src/haddock/gear/postprocessing.py

diff --git a/src/haddock/clis/cli.py b/src/haddock/clis/cli.py
index a3681430c..65e4b78e7 100755
--- a/src/haddock/clis/cli.py
+++ b/src/haddock/clis/cli.py
@@ -75,7 +75,7 @@ def cli(ap: ArgumentParser, main: Callable[..., None]) -> None:
 
 def maincli() -> None:
     """Execute main client."""
-    cli(ap, main)
+    cli(_ap(), main)
 
 
 def main(
@@ -117,6 +117,7 @@ def main(
         get_initial_greeting,
         gen_feedback_messages,
         )
+    from haddock.gear.postprocessing import archive_run
     from haddock.gear.prepare_run import setup_run
     from haddock.libs.libio import working_directory
     from haddock.libs.liblog import (
@@ -181,10 +182,7 @@ def main(
         restart_step = restart
         WorkflowManager_ = WorkflowManager
 
-    with (
-        working_directory(_run_dir),
-        log_error_and_exit(),
-    ):
+    with (working_directory(_run_dir), log_error_and_exit()):
         workflow = WorkflowManager_(
             workflow_params=params,
             start=restart_step,
@@ -193,10 +191,20 @@ def main(
 
         # Main loop of execution
         workflow.run()
+
+        # Run post-processing steps
         if other_params["postprocess"]:
-            workflow.postprocess()
+            workflow.postprocess(self_contained=other_params["gen_archive"])
+        # Clean outputs
         workflow.clean()
 
+    # Generate archive of the run
+    if other_params["gen_archive"]:
+        _run_archive, _analysis_archive = archive_run(_run_dir)
+        log.info(f"Run archive created: {_run_archive}!")
+        if _analysis_archive:
+            log.info(f"Run analysis archive created: {_analysis_archive}")
+
     # Finish
     end = time()
     elapsed = convert_seconds_to_min_sec(end - start)
diff --git a/src/haddock/clis/cli_analyse.py b/src/haddock/clis/cli_analyse.py
index 65ca24507..f0ef7e519 100644
--- a/src/haddock/clis/cli_analyse.py
+++ b/src/haddock/clis/cli_analyse.py
@@ -20,6 +20,7 @@
 Where, ``-m 1 3`` means that the analysis will be performed on ``1_rigidbody``
  and ``3_flexref``.
 """
+
 import argparse
 import os
 import shutil
@@ -66,59 +67,6 @@
 INTER_STR = INTERACTIVE_RE_SUFFIX  # suffix of interactive analysis folders
 
 
-def get_cluster_ranking(
-        capri_clt_filename: FilePath,
-        top_cluster: int,
-        ) -> ClRank:
-    """
-    Get capri cluster ranking.
-
-    Parameters
-    ----------
-    capri_clt_filename : str or Path
-        capri cluster filename
-    top_cluster : int
-        Number of clusters to be considered
-
-    Returns
-    -------
-    cl_ranking : dict
-        {cluster_id : cluster_rank} dictionary
-    """
-    cl_ranking: ClRank = {}
-    dfcl = read_capri_table(capri_clt_filename)
-    for n in range(min(top_cluster, dfcl.shape[0])):
-        cl_ranking[dfcl["cluster_id"].iloc[n]] = dfcl["caprieval_rank"].iloc[n]
-    return cl_ranking
-
-
-def update_paths(
-    capri_ss_filename: FilePath, toch: str = "../", toadd: str = "../../"
-) -> None:
-    """
-    Update paths in capri_ss_filename.
-
-    Parameters
-    ----------
-    capri_ss_filename : str or Path
-        capri ss filename
-    toch : str
-        string to be replaced
-    toadd : str
-        string to be added
-    """
-    new_lines: list[str] = []
-    with open(capri_ss_filename, "r") as rfile:
-        for ln in rfile:
-            new_ln = ln.replace(toch, toadd)
-            new_lines.append(new_ln)
-
-    with open(capri_ss_filename, "w") as wfile:
-        for ln in new_lines:
-            wfile.write(ln)
-    return
-
-
 # Command line interface parser
 ap = argparse.ArgumentParser(
     prog="haddock3-analyse",
@@ -205,6 +153,14 @@ def update_paths(
     default=1,
 )
 
+ap.add_argument(
+    "--self-contained",
+    help="Should the models be accessed locally?",
+    default=False,
+    # `type=bool` would turn any non-empty string ("False", "0") into True
+    type=lambda v: str(v).strip().lower() in ("1", "true", "yes"),
+)
+
 
 ap.add_argument(
     "-p",
@@ -242,6 +198,59 @@ def maincli() -> None:
     cli(_ap(), main)
 
 
+def get_cluster_ranking(
+        capri_clt_filename: FilePath,
+        top_cluster: int,
+        ) -> ClRank:
+    """
+    Get capri cluster ranking.
+
+    Parameters
+    ----------
+    capri_clt_filename : str or Path
+        capri cluster filename
+    top_cluster : int
+        Number of clusters to be considered
+
+    Returns
+    -------
+    cl_ranking : dict
+        {cluster_id : cluster_rank} dictionary
+    """
+    cl_ranking: ClRank = {}
+    dfcl = read_capri_table(capri_clt_filename)
+    for n in range(min(top_cluster, dfcl.shape[0])):
+        cl_ranking[dfcl["cluster_id"].iloc[n]] = dfcl["caprieval_rank"].iloc[n]
+    return cl_ranking
+
+
+def update_paths(
+    capri_ss_filename: FilePath, toch: str = "../", toadd: str = "../../"
+) -> None:
+    """
+    Update paths in capri_ss_filename.
+
+    Parameters
+    ----------
+    capri_ss_filename : str or Path
+        capri ss filename
+    toch : str
+        string to be replaced
+    toadd : str
+        string to be added
+    """
+    new_lines: list[str] = []
+    with open(capri_ss_filename, "r") as rfile:
+        for ln in rfile:
+            new_ln = ln.replace(toch, toadd)
+            new_lines.append(new_ln)
+
+    with open(capri_ss_filename, "w") as wfile:
+        for ln in new_lines:
+            wfile.write(ln)
+    return
+
+
 def run_capri_analysis(
         step: str,
         run_dir: FilePath,
@@ -356,67 +365,128 @@ def update_paths_in_capri_dict(
     return new_capri_dict
 
 
-def zip_top_ranked(
+def get_top_ranked_mapping(
         capri_filename: FilePath,
         cluster_ranking: ClRank,
-        summary_name: FilePath,
-        ) -> None:
-    """
-    Zip the top ranked structures.
+        clustered_topX: int = 4,
+        unclustered_topX: int = 10,
+        ) -> dict[Path, str]:
+    """Map each top-ranked structure file to its name in the summary."""
+    top_ranked_mapping: dict[Path, str] = {}
 
-    Parameters
-    ----------
-    cluster_ranking : dict
-        {cluster_id : cluster_rank} dictionary
-    ss_file : str or Path
-        capri ss filename
-
-    Returns
-    -------
-    output_zipfile : str or Path
-        path to the zipped file
-    """
+    # Read the single-structure caprieval table
     capri_df = read_capri_table(capri_filename, comment="#")
+    # Group by clusters
     gb_cluster = capri_df.groupby("cluster_id")
+
+    # Loop over clusters
     for cl_id, cl_df in gb_cluster:
+        # Only consider clusters present in `cluster_ranking`
         if cl_id in cluster_ranking.keys():
+            # If clustered structure
             if cl_id != "-":
-                structs = cl_df.loc[cl_df["model-cluster_ranking"] <= 4][["model", "model-cluster_ranking"]]
+                # Retrieve only the top `clustered_topX` models per cluster
+                structs = cl_df.loc[cl_df["model-cluster_ranking"] <= clustered_topX][["model", "model-cluster_ranking"]]  # noqa : E501
+            # If un-clustered structures
             else:
-                structs = cl_df.loc[cl_df["caprieval_rank"] <= 10][["model", "caprieval_rank"]]
+                # Retrieve the top `unclustered_topX` models overall
+                structs = cl_df.loc[cl_df["caprieval_rank"] <= unclustered_topX][["model", "caprieval_rank"]]  # noqa : E501
+            # Rename columns to access them using same keywords
             structs.columns = ["model", "rank"]
             # iterate over the structures
             for _, row in structs.iterrows():
-                struct = Path(row["model"])
-                struct_gz = Path(f"{struct}.gz")
+                # Rank of the model (in its cluster, or overall if unclustered)
                 rank = row["rank"]
                 # set target name
                 if cl_id != "-":
-                    target_name = f"cluster_{cluster_ranking[cl_id]}_model_{rank}.pdb"
+                    # Name it after its cluster rank and in-cluster rank
+                    target_name = (
+                        f"cluster_{cluster_ranking[cl_id]}"
+                        f"_model_{rank}.pdb"
+                        )
                 else:
+                    # Name it after its overall rank
                     target_name = f"model_{rank}.pdb"
+
+                # Build both the plain and the gzipped structure paths
                 struct = Path(row["model"])
                 struct_gz = Path(f"{struct}.gz")
                 # copy the structure
                 if Path(struct).exists():
-                    shutil.copy(struct, Path(target_name))
+                    top_ranked_mapping[struct] = target_name
                 elif struct_gz.exists():
-                    shutil.copy(struct_gz, ".")
-                    # unpack the file
-                    _unpack_gz(Path(".", struct_gz.name))
-                    shutil.move(struct.name, Path(target_name))
+                    top_ranked_mapping[struct_gz] = target_name
                 else:
                     log.warning(f"structure {struct} not found")
+    return top_ranked_mapping
 
-    # now make the archive and delete the pdb files
-    archive_files_ext(".", "pdb")
-    for file in Path(".").glob("*.pdb"):
-        file.unlink()
-    # move archive to summary
-    expected_archive = Path(".", "pdb.tgz")
-    if expected_archive.exists():
-        shutil.move("pdb.tgz", summary_name)
-        log.info(f"Summary archive {summary_name} created!")
+def zip_top_ranked(
+        top_ranked_mapping: dict[Path, str],
+        summary_name: str,
+        gen_archive: bool,
+        ) -> Optional[Path]:
+    """
+    Zip the top ranked structures.
+
+    Parameters
+    ----------
+    top_ranked_mapping : dict[Path, str]
+        Maps each source structure (``.pdb`` or ``.pdb.gz``) to the
+        file name it must carry in the summary. When `gen_archive` is
+        False, the values are updated in place to point inside the
+        generated directory.
+    summary_name: str
+        Base name of the archive (``.tgz`` is appended) or of the
+        directory to be generated.
+    gen_archive: bool
+        If True, bundle the structures in a ``.tgz`` archive and remove
+        the loose copies; if False, move them into a directory named
+        `summary_name`.
+
+    Return
+    ------
+    output_fname : Optional[Path]
+        Path to the generated .tgz archive or directory, None if no archive.
+    """
+    for ori_fpath, new_name in top_ranked_mapping.items():
+        # If already compressed
+        if ori_fpath.suffix == ".gz":
+            copied_fpath = Path(shutil.copy(ori_fpath, "."))
+            # unpack the local copy (shutil.copy returns a str, hence Path)
+            _unpack_gz(copied_fpath)
+            # Rename the unpacked file (copy name without trailing .gz)
+            shutil.move(copied_fpath.stem, new_name)
+        else:
+            shutil.copy(ori_fpath, new_name)
+
+    # Compress pdb files
+    if gen_archive:
+        archive_was_created = archive_files_ext(".", "pdb")
+        # Delete the pdb files
+        for file_ in top_ranked_mapping.values():
+            file_.unlink()
+        output_fname = Path(f"{summary_name}.tgz")
+        if archive_was_created:
+            # move archive to summary
+            shutil.move("pdb.tgz", output_fname)
+            log.info(f"Top structures summary archive {output_fname} created!")
+            return output_fname
+        else:
+            log.warning(f"Summary archive {output_fname} not created!")
+            return None
+    # Generate a directory holding all the structures
     else:
-        log.warning(f"Summary archive {summary_name} not created!")
+        output_fname = Path(summary_name)
+        output_fname.mkdir(parents=True, exist_ok=True)
+        for ori_fpath, new_name in top_ranked_mapping.items():
+            # Create new path
+            next_filepath = Path(output_fname, str(new_name))
+            # Hold it in mapping dict
+            top_ranked_mapping[ori_fpath] = str(next_filepath)
+            # Displace file
+            shutil.move(new_name, top_ranked_mapping[ori_fpath])
+        log.info(f"Top structures copied into {output_fname}!")
+        return output_fname
 
 
 def analyse_step(
@@ -431,6 +501,9 @@ def analyse_step(
     offline: bool = False,
     mode: str = "local",
     ncores: int = 4,
+    self_contained: bool = False,
+    clustered_topX: int = 4,
+    unclustered_topX: int = 10,
 ) -> None:
     """
     Analyse a step.
@@ -454,13 +527,31 @@ def analyse_step(
         Produce images in the selected format.
     scale : int
         scale for images.
+    is_cleaned: bool
+        is the directory going to be cleaned?
+    offline: bool
+        Should plots js functions be self-contained?
+    mode: str
+        mode of execution
+    ncores: int
+        number of cores to use
+    self_contained : bool
+        Should the analysis directory contain the models?
+    clustered_topX: int
+        Number of models to access per cluster. Default is 4.
+    unclustered_topX: int
+        Number of models to access when no clusters. Default is 10.
     """
     log.info(f"Analysing step {step}")
-
+    # Create directory
     target_path.mkdir(parents=True, exist_ok=False)
+    # Build caprieval output file names/paths
+    ss_filename = Path("capri_ss.tsv")
+    clt_filename = Path("capri_clt.tsv")
     step_name = step.split("_")[1]
-    ss_fname = Path(run_dir, f"{step}/capri_ss.tsv")
-    clt_fname = Path(run_dir, f"{step}/capri_clt.tsv")
+    ss_fname = Path(run_dir, f"{step}/{ss_filename}")
+    clt_fname = Path(run_dir, f"{step}/{clt_filename}")
+    # Search for caprieval output files
     if step_name != "caprieval":
         if ss_fname.exists() and clt_fname.exists():
             log.info(f"step {step} has caprieval data, files are available")
@@ -472,44 +563,62 @@ def analyse_step(
         log.info(f"step {step} is caprieval, files should be already available")
         run_capri = False
 
-    if run_capri == False:
+    # If caprieval data available, just copy them
+    if not run_capri:
         shutil.copy(ss_fname, target_path)
         shutil.copy(clt_fname, target_path)
 
     # Go to directory where to write all the analysis figures / report
     os.chdir(target_path)
     # if the step is not caprieval, caprieval must be run
-    if run_capri == True:
+    if run_capri:
         run_capri_analysis(step, run_dir, capri_dict, is_cleaned, mode, ncores)
 
     log.info("CAPRI files identified")
     # plotting
-    ss_file = Path("capri_ss.tsv")
-    clt_file = Path("capri_clt.tsv")
-    if clt_file.exists():
-        cluster_ranking = get_cluster_ranking(clt_file, top_cluster)
+    if clt_filename.exists():
+        cluster_ranking = get_cluster_ranking(clt_filename, top_cluster)
     else:
-        raise Exception(f"clustering file {clt_file} does not exist")
-    if ss_file.exists():
+        raise Exception(f"clustering file {clt_filename} does not exist")
+    if ss_filename.exists():
+        # Generate file mapping for top ranked structures
+        top_ranked_mapping = get_top_ranked_mapping(
+            ss_filename,
+            cluster_ranking,
+            clustered_topX=clustered_topX,
+            unclustered_topX=unclustered_topX,
+            )
+        # provide a zipped archive of the top ranked structures
+        zip_top_ranked(
+            top_ranked_mapping,
+            "summary",
+            not self_contained,
+            )
         log.info("Plotting results..")
         scatters = scatter_plot_handler(
-            ss_file,
+            ss_filename,
             cluster_ranking,
             format,
             scale,
             offline=offline,
             )
         boxes = box_plot_handler(
-            ss_file,
+            ss_filename,
             cluster_ranking,
             format,
             scale,
             offline=offline,
             )
-        tables = clt_table_handler(clt_file, ss_file, is_cleaned)
+        tables = clt_table_handler(
+            clt_filename,
+            ss_filename,
+            is_cleaned,
+            topX_clusters=top_cluster,
+            clustered_topX=clustered_topX,
+            unclustered_topX=unclustered_topX,
+            top_ranked_mapping=top_ranked_mapping if self_contained else None,
+            )
         report_generator(boxes, scatters, tables, step, ".", offline)
-        # provide a zipped archive of the top ranked structures
-        zip_top_ranked(ss_file, cluster_ranking, Path("summary.tgz"))
 
 
 def validate_format(_format: Optional[ImgFormat]) -> Optional[ImgFormat]:
@@ -570,6 +679,7 @@ def main(
     offline: bool = False,
     mode: Optional[str] = None,
     ncores: Optional[int] = None,
+    self_contained: bool = False,
     **kwargs: Any,
 ) -> None:
     """
@@ -579,33 +689,26 @@ def main(
     ----------
     run_dir : str or Path
         Path to the original run directory.
-
     modules : list of ints
         List of the integer prefix of the modules to copy.
-
     top_cluster : int
         Number of clusters to be considered.
-
     format : str
         Produce images in the selected format.
-
     scale : int
         scale for images.
-
     inter: bool
         analyse only steps labelled as 'interactive'
-
     is_cleaned: bool
         is the directory going to be cleaned?
-    
     offline: bool
         Should plots js functions be self-contained?
-    
     mode: str
         mode of execution
-    
     ncores: int
         number of cores to use
+    self_contained : bool
+        Should the analysis directory contain the models?
     """
     log.level = 20
     log.info(
@@ -646,7 +749,7 @@ def main(
     bad_folder_paths: list[Path] = []
     for step in sel_steps:
         subfolder_name = f"{step}_analysis"
-        target_path = Path(Path("./"), subfolder_name)
+        target_path = Path(subfolder_name)
 
         # check if subfolder is already present
         dest_path = Path(ANA_FOLDER, subfolder_name)
@@ -662,7 +765,6 @@ def main(
                 shutil.rmtree(dest_path)
 
         # run the analysis
-        error = False
         try:
             analyse_step(
                 step,
@@ -676,14 +778,14 @@ def main(
                 offline=offline,
                 mode=mode,
                 ncores=ncores,
-            )
+                #self_contained=self_contained,
+                self_contained=True,
+                )
         except Exception as e:
-            error = True
             log.warning(
                 f"Could not execute the analysis for step {step}. "
                 f"The following error occurred {e}"
                 )
-        if error:
             bad_folder_paths.append(target_path)
         else:
             good_folder_paths.append(target_path)
@@ -694,8 +796,23 @@ def main(
     # moving files into analysis folder
     if good_folder_paths != []:
         log.info("moving files to analysis folder")
+        urls: list[str] = []
         for directory in good_folder_paths:
             shutil.move(directory, outdir)
+            url = f"- [{Path(directory, 'report.html')}](http://0.0.0.0:8000/{Path(directory, 'report.html')}) "  # noqa : E501
+            urls.append(url)
+        
+        # Adding instructions on how to setup the server
+        readme_fpath = Path(outdir, "README.md")
+        readme_fpath.write_text(
+            f"# Usage{os.linesep}{os.linesep}"
+            "To view structures or download the structure files, "
+            f"in a terminal run the command:{os.linesep}```bash{os.linesep}"
+            f"python -m http.server --directory .{os.linesep}```{os.linesep}"
+            f"And open the link following links in a web browser:{os.linesep}"
+            f"{os.linesep.join(urls)}{os.linesep}"
+            )
+        assert readme_fpath.exists()
 
     if bad_folder_paths != []:
         log.info("cancelling unsuccesful analysis folders")
@@ -714,12 +831,12 @@ def main(
         log.info(f"View the results in {report_file}")
         info_msg = (
             "To view structures or download the structure files, "
-            f"in a terminal run the command "
-            f"`python -m http.server --directory {rundir_cwd}`. "
-            "By default, http server runs on `http://0.0.0.0:8000/`. "
-            f"Open the link http://0.0.0.0:8000/{report_file} "
+            f"in a terminal run the command: {os.linesep}"
+            f">python -m http.server --directory {rundir_cwd}{os.linesep}"
+            # "By default, http server runs on `http://0.0.0.0:8000/`. "
+            f"And open the link http://0.0.0.0:8000/{report_file} "
             "in a web browser."
-        )
+            )
         log.info(info_msg)
     os.chdir(ori_cwd)
     return
diff --git a/src/haddock/core/optional.yaml b/src/haddock/core/optional.yaml
index 3e23e7f33..a07a3d9c1 100644
--- a/src/haddock/core/optional.yaml
+++ b/src/haddock/core/optional.yaml
@@ -21,4 +21,12 @@ postprocess:
     used to plot the results of a HADDOCK3 workflow. If this option, this command
     is automatically executed at the end of the workflow (on the caprieval folders).
   explevel: easy
-
+gen_archive:
+  default: false
+  type: boolean
+  title: Generates an archive of the run and of the analysis.
+  short: If true, executes haddock3-analyse in self_contained mode and generates
+    archives of the run_directory and of the analysis in two separated tgz files.
+  long: If true, executes haddock3-analyse in self_contained mode and generates
+    archives of the run_directory and of the analysis in two separated tgz files.
+  explevel: easy
diff --git a/src/haddock/gear/postprocessing.py b/src/haddock/gear/postprocessing.py
new file mode 100644
index 000000000..349dcbb6b
--- /dev/null
+++ b/src/haddock/gear/postprocessing.py
@@ -0,0 +1,46 @@
+"""Tools for post-processing haddock3 runs."""
+
+import os
+import shutil
+import tarfile
+
+from haddock import log
+from haddock.clis.cli_analyse import ANA_FOLDER
+from haddock.core.typing import Optional
+
+
+def archive_run(run_dir: str, delete: bool = True) -> tuple[str, Optional[str]]:
+    """Create an archive of the haddock3 run directory and analysis.
+
+    Parameters
+    ----------
+    run_dir : str
+        Path to the run directory
+    delete : bool, optional
+        Should the un-archived directory be deleted?, by default True
+
+    Returns
+    -------
+    tuple[str, Optional[str]]
+        run_archive_fname : str
+            Path to the run archive
+        analysis_archive_fname : Optional[str]
+            Path to the run analysis archive, None if no analysis found
+    """
+    log.info("Creating an archive of the run")
+    # Drop trailing separators so archives are written next to the run
+    run_dir = os.path.normpath(run_dir)
+    run_name = os.path.basename(run_dir)
+    run_archive_fname = f"{run_dir}.tgz"
+    with tarfile.open(run_archive_fname, "w:gz") as tar:
+        tar.add(run_dir, arcname=run_name)
+    # Archive the analysis directory under a name relative to the run
+    analysis_dir = os.path.join(run_dir, ANA_FOLDER)
+    analysis_archive_fname = None
+    if os.path.exists(analysis_dir):
+        analysis_archive_fname = f"{run_dir}_{ANA_FOLDER}.tgz"
+        with tarfile.open(analysis_archive_fname, "w:gz") as tar:
+            tar.add(analysis_dir, arcname=f"{run_name}_{ANA_FOLDER}")
+    if delete:
+        shutil.rmtree(run_dir)
+    return run_archive_fname, analysis_archive_fname
\ No newline at end of file
diff --git a/src/haddock/libs/libplots.py b/src/haddock/libs/libplots.py
index 0747b2192..b85f2bae7 100644
--- a/src/haddock/libs/libplots.py
+++ b/src/haddock/libs/libplots.py
@@ -877,7 +877,15 @@ def create_other_cluster(
     return clusters_df, structs_df
 
 
-def clt_table_handler(clt_file, ss_file, is_cleaned=False):
+def clt_table_handler(
+        clt_file: FilePath,
+        ss_file: FilePath,
+        is_cleaned: bool = False,
+        topX_clusters: int = 10,
+        clustered_topX: int = 4,
+        unclustered_topX: int = 10,
+        top_ranked_mapping: Optional[dict[Path, str]] = None,
+        ) -> pd.DataFrame:
     """
     Create a dataframe including data for tables.
 
@@ -906,23 +914,34 @@ def clt_table_handler(clt_file, ss_file, is_cleaned=False):
     clusters_df = clusters_df.round(2)
     structs_df = structs_df.round(2)
 
-    # if the run will be cleaned, the structures are going to be gzipped
-    if is_cleaned:
-        # substitute the values in the df by adding .gz at the end
-        structs_df['model'] = structs_df['model'].replace(
-            to_replace=r"(\.pdb)$", value=r".pdb.gz", regex=True,
-            )
+    if not top_ranked_mapping:
+        # if the run will be cleaned, the structures are going to be gzipped
+        if is_cleaned and not top_ranked_mapping:
+            # substitute the values in the df by adding .gz at the end
+            structs_df['model'] = structs_df['model'].replace(
+                to_replace=r"(\.pdb)$", value=r".pdb.gz", regex=True,
+                )
 
     # ss_file is in NN_caprieval/ while report is in
     # analysis/NN_caprieval_analysis/
     # need to correct model paths by prepending ../
-    structs_df['model'] = structs_df['model'].apply(lambda x: f"../{x}")
+    def correct_relative_paths(
+            path: str,
+            top_ranked_mapping: Optional[dict[Path, str]],
+            ) -> str:
+        """Return the mapped local path of `path`, else prepend ``../``."""
+        # No mapping supplied (None/empty) or model not part of it
+        if not top_ranked_mapping or Path(path) not in top_ranked_mapping:
+            return f"../{path}"
+        return top_ranked_mapping[Path(path)]
+    structs_df['model'] = structs_df['model'].apply(
+        lambda x: correct_relative_paths(x, top_ranked_mapping)
+        )
 
     is_unclustered = clusters_df["cluster_rank"].unique().tolist() == ["-"]
     # If unclustered, we only want to show the top 10 structures in a table.
     if is_unclustered:
-        max_unstructured_structures = 10
-        structs_df = structs_df[:max_unstructured_structures]
+        structs_df = structs_df[:unclustered_topX]
         cols2keep = ['caprieval_rank', 'model'] + list(AXIS_NAMES.keys())
         structs_df = structs_df[cols2keep]
         # model has ../../01_rigidbody/rigidbody_62.pdb.gz
@@ -933,11 +952,11 @@ def clt_table_handler(clt_file, ss_file, is_cleaned=False):
     clusters_df, structs_df = create_other_cluster(
         clusters_df,
         structs_df,
-        max_clusters=11,
+        max_clusters=topX_clusters + 1,
         )
 
     clusters_df = clean_capri_table(clusters_df)
-    structs_df = find_best_struct(structs_df, max_best_structs=4)
+    structs_df = find_best_struct(structs_df, max_best_structs=clustered_topX)
     df_merged = pd.merge(clusters_df, structs_df, on="cluster_id")
     return df_merged
 
diff --git a/src/haddock/libs/libworkflow.py b/src/haddock/libs/libworkflow.py
index ca2c45ccb..e789b79f3 100644
--- a/src/haddock/libs/libworkflow.py
+++ b/src/haddock/libs/libworkflow.py
@@ -59,7 +59,7 @@ def clean(self, terminated: Optional[int] = None) -> None:
         for step in self.recipe.steps[:terminated]:
             step.clean()
 
-    def postprocess(self) -> None:
+    def postprocess(self, self_contained: bool = False) -> None:
         """Postprocess the workflow."""
         # is the workflow going to be cleaned?
         is_cleaned = self.recipe.steps[0].config['clean']
@@ -69,14 +69,25 @@ def postprocess(self) -> None:
         mode = self.recipe.steps[0].config['mode']
         # ncores
         ncores = self.recipe.steps[0].config['ncores']
-        
+
         capri_steps: list[int] = []
         for step in self.recipe.steps:
             if step.module_name == "caprieval":
                 capri_steps.append(step.order)  # type: ignore
         # call cli_analyse (no need for capri_dicts, it's all precalculated)
-        cli_analyse("./", capri_steps, top_cluster=10, format=None, scale=None,
-                inter=False, is_cleaned=is_cleaned, offline=offline, mode=mode, ncores=ncores)
+        cli_analyse(
+            "./",
+            capri_steps,
+            top_cluster=10,
+            format=None,
+            scale=None,
+            inter=False,
+            is_cleaned=is_cleaned,
+            offline=offline,
+            mode=mode,
+            ncores=ncores,
+            self_contained=self_contained,
+            )
         # call cli_traceback. If it fails, it's not a big deal
         try:
             cli_traceback("./", offline=offline)

From 3ea287d3a1ed9cf03ce5e28ca64eff9fc06a5a67 Mon Sep 17 00:00:00 2001
From: VGPReys <v.g.p.reys@uu.nl>
Date: Fri, 1 Nov 2024 16:23:12 +0100
Subject: [PATCH 4/8] update test

---
 src/haddock/clis/cli_analyse.py |  2 +-
 tests/test_cli_analyse.py       | 26 ++++++++++++++++++++------
 2 files changed, 21 insertions(+), 7 deletions(-)

diff --git a/src/haddock/clis/cli_analyse.py b/src/haddock/clis/cli_analyse.py
index f0ef7e519..9baee2ca5 100644
--- a/src/haddock/clis/cli_analyse.py
+++ b/src/haddock/clis/cli_analyse.py
@@ -464,7 +464,7 @@ def zip_top_ranked(
         archive_was_created = archive_files_ext(".", "pdb")
         # Delete the pdb files
         for file_ in top_ranked_mapping.values():
-            file_.unlink()
+            Path(file_).unlink()
         output_fname = Path(f"{summary_name}.tgz")
         if archive_was_created:
             # move archive to summary
diff --git a/tests/test_cli_analyse.py b/tests/test_cli_analyse.py
index bf5d4325c..224ec3d9f 100644
--- a/tests/test_cli_analyse.py
+++ b/tests/test_cli_analyse.py
@@ -8,6 +8,7 @@
 
 from haddock.clis.cli_analyse import (
     get_cluster_ranking,
+    get_top_ranked_mapping,
     main,
     update_capri_dict,
     zip_top_ranked,
@@ -51,11 +52,13 @@ def test_update_capri_dict(default_capri):
 def test_get_cluster_ranking(example_capri_clt):
     """Test get_cluster_ranking."""
     obs_cl_ranking = get_cluster_ranking(example_capri_clt, 5)
-    exp_cl_ranking = {16: 1,
-                      1: 2,
-                      13: 3,
-                      4: 4,
-                      5: 5}
+    exp_cl_ranking = {
+        16: 1,
+        1: 2,
+        13: 3,
+        4: 4,
+        5: 5,
+        }
     assert exp_cl_ranking == obs_cl_ranking
 
 
@@ -109,9 +112,20 @@ def test_zip_top_ranked(example_capri_ss, monkeypatch):
         monkeypatch.chdir(rigid_dir_analysis)
         
         exp_cl_ranking = {1: 2}
-        zip_top_ranked(example_capri_ss, exp_cl_ranking, "summary.tgz")
+        top_ranked_mapping = get_top_ranked_mapping(
+            example_capri_ss,
+            exp_cl_ranking,
+            )
+        # Archive version
+        zip_top_ranked(top_ranked_mapping, "summary", True)
         assert os.path.isfile("summary.tgz") is True
 
+        # Non-Archived version
+        zip_top_ranked(top_ranked_mapping, "notarchived", False)
+        assert not os.path.isfile("notarchived.tgz")
+        assert os.path.isdir("notarchived")
+        assert len(list(Path("notarchived").glob("*.pdb"))) > 0
+
 
 def test_main_offline(example_capri_ss, example_capri_clt, tmp_path):
     """Test cli_analyse main in offline mode."""

From 5570d4414816567cde9fd83c1fc8f50b230a624c Mon Sep 17 00:00:00 2001
From: Victor Reys <132575181+VGPReys@users.noreply.github.com>
Date: Fri, 6 Dec 2024 09:06:44 +0100
Subject: [PATCH 5/8] add missing comma in greetings.py feedback_urls

---
 src/haddock/gear/greetings.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/haddock/gear/greetings.py b/src/haddock/gear/greetings.py
index 9565c5bd2..e34869b93 100644
--- a/src/haddock/gear/greetings.py
+++ b/src/haddock/gear/greetings.py
@@ -30,7 +30,7 @@
 # Do not hesitate to update / comment one of these
 feedback_urls = {
     "GitHub issues": "https://github.com/haddocking/haddock3/issues",
-    "BioExcel feedback": "https://www.bonvinlab.org/feedback"
+    "BioExcel feedback": "https://www.bonvinlab.org/feedback",
     # "BioExcel survey": "https://bioexcel.eu/bioexcel-survey-2024/",
     "BioExcel forum": "https://ask.bioexcel.eu/c/haddock/6",
 }

From 03d01431dc438eca0198e310cd4fe462f6650b51 Mon Sep 17 00:00:00 2001
From: VGPReys <v.g.p.reys@uu.nl>
Date: Mon, 27 Jan 2025 16:51:47 +0100
Subject: [PATCH 6/8] fix deprecation warning on regex using backslash

---
 src/haddock/modules/topology/topoaa/__init__.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/haddock/modules/topology/topoaa/__init__.py b/src/haddock/modules/topology/topoaa/__init__.py
index 22373dc26..ad3596074 100644
--- a/src/haddock/modules/topology/topoaa/__init__.py
+++ b/src/haddock/modules/topology/topoaa/__init__.py
@@ -160,8 +160,8 @@ def get_ensemble_origin(ensemble_f: FilePath) -> dict[int, str]:
         lines = text.split(os.linesep)
         REMARK_lines = (line for line in lines if line.startswith("REMARK"))
         re_origin = re.compile(
-            "REMARK\s+MODEL\s+(\d+)\s+(FROM|from|From)\s+(([\w_-]+\.?)+)"
-        )  # noqa : E501
+            r"REMARK\s+MODEL\s+(\d+)\s+(FROM|from|From)\s+(([\w_-]+\.?)+)"
+        )
         for line in REMARK_lines:
             if match := re_origin.search(line):
                 model_num = int(match.group(1).strip())

From 4d680635f230591e2cee324fe0441ff47d166ef2 Mon Sep 17 00:00:00 2001
From: VGPReys <v.g.p.reys@uu.nl>
Date: Wed, 29 Jan 2025 10:52:00 +0100
Subject: [PATCH 7/8] fix analyse on interactive steps

---
 integration_tests/test_full_workflow.py | 28 ++++++++++-------
 src/haddock/clis/cli_analyse.py         | 21 ++++++-------
 src/haddock/libs/libplots.py            | 41 +++++++++++++------------
 3 files changed, 48 insertions(+), 42 deletions(-)

diff --git a/integration_tests/test_full_workflow.py b/integration_tests/test_full_workflow.py
index ad351ba1f..51aeb6278 100644
--- a/integration_tests/test_full_workflow.py
+++ b/integration_tests/test_full_workflow.py
@@ -2,8 +2,13 @@
 from pathlib import Path
 import os
 import shutil
-from haddock.libs.libworkflow import WorkflowManager
+
+from haddock.clis.cli import main as cli_main
+from haddock.clis.cli_analyse import main as cli_analyse
+from haddock.clis.cli_re import maincli
 from haddock.core.typing import Any
+from haddock.libs.libworkflow import WorkflowManager
+
 from integration_tests import GOLDEN_DATA
 
 
@@ -66,7 +71,7 @@ def test_interactive_analysis_on_workflow(monkeypatch):
 
         monkeypatch.chdir(tmpdir)
 
-        from haddock.clis.cli import main as cli_main
+        
         cli_main(
             Path("workflow.cfg"),
         )
@@ -85,7 +90,7 @@ def test_interactive_analysis_on_workflow(monkeypatch):
 
         # now running interactive re-clustering
         clustfcc_dir = f"{run_dir}/2_clustfcc"
-        from haddock.clis.cli_re import maincli
+        
         # faking sys.argv in input to haddock3-re
         monkeypatch.setattr("sys.argv",
                             ["haddock3-re", "clustfcc", clustfcc_dir, "-f", "0.7"]
@@ -106,14 +111,15 @@ def test_interactive_analysis_on_workflow(monkeypatch):
         assert Path(run_dir, "3_caprieval_interactive/capri_ss.tsv").exists() is True
 
         # now analyse the interactive folders
-        from haddock.clis.cli_analyse import main as cli_analyse
-        cli_analyse(run_dir,
-                    [2,3],
-                    10,
-                    format=None,
-                    scale=None,
-                    is_cleaned=True,
-                    inter=True)
+        cli_analyse(
+            run_dir,
+            [2, 3],
+            10,
+            format=None,
+            scale=None,
+            is_cleaned=True,
+            inter=True,
+            )
         exp_clustfcc_dir = Path(run_dir, "analysis", "2_clustfcc_interactive_analysis")
         exp_caprieval_dir = Path(run_dir, "analysis", "3_caprieval_interactive_analysis")
         assert os.path.isdir(exp_clustfcc_dir) is True
diff --git a/src/haddock/clis/cli_analyse.py b/src/haddock/clis/cli_analyse.py
index a2df10255..1b529d88d 100644
--- a/src/haddock/clis/cli_analyse.py
+++ b/src/haddock/clis/cli_analyse.py
@@ -370,9 +370,9 @@ def get_top_ranked_mapping(
         cluster_ranking: ClRank,
         clustered_topX: int = 4,
         unclustered_topX: int = 10,
-        ) -> dict[Path, str]:
+        ) -> dict[Path, Path]:
     # Set mapping of generated files
-    top_ranked_mapping: dict[Path, str] = {}
+    top_ranked_mapping: dict[Path, Path] = {}
 
     # Read table
     capri_df = read_capri_table(capri_filename, comment="#")
@@ -413,15 +413,15 @@ def get_top_ranked_mapping(
                 struct_gz = Path(f"{struct}.gz")
                 # copy the structure
                 if Path(struct).exists():
-                    top_ranked_mapping[struct] = target_name
+                    top_ranked_mapping[struct] = Path(target_name)
                 elif struct_gz.exists():
-                    top_ranked_mapping[struct_gz] = target_name
+                    top_ranked_mapping[struct_gz] = Path(target_name)
                 else:
                     log.warning(f"structure {struct} not found")
     return top_ranked_mapping
 
 def zip_top_ranked(
-        top_ranked_mapping: dict[Path, str],
+        top_ranked_mapping: dict[Path, Path],
         summary_name: str,
         gen_archive: bool,
         ) -> Optional[Path]:
@@ -451,9 +451,9 @@ def zip_top_ranked(
     for ori_fpath, new_name in top_ranked_mapping.items():
         # If already compressed
         if ori_fpath.suffix == ".gz":
-            copied_fpath = shutil.copy(ori_fpath, ".")
+            copied_fpath = Path(shutil.copy(ori_fpath, "."))
             # unpack the file
-            _unpack_gz(copied_fpath.name)
+            _unpack_gz(copied_fpath)
             # Rename it
             shutil.move(copied_fpath.name.replace(".gz", ""), new_name)
         else:
@@ -482,7 +482,7 @@ def zip_top_ranked(
             # Create new path
             next_filepath = Path(output_fname, str(new_name))
             # Hold it in mapping dict
-            top_ranked_mapping[ori_fpath] = str(next_filepath)
+            top_ranked_mapping[ori_fpath] = Path(next_filepath)
             # Displace file
             shutil.move(new_name, top_ranked_mapping[ori_fpath])
         log.info(f"Top structures copied into {output_fname}!")
@@ -778,13 +778,12 @@ def main(
                 offline=offline,
                 mode=mode,
                 ncores=ncores,
-                #self_contained=self_contained,
-                self_contained=True,
+                self_contained=self_contained,
                 )
         except Exception as e:
             log.warning(
                 f"Could not execute the analysis for step {step}. "
-                f"The following error occurred {e}"
+                f"The following error occurred: {e}"
                 )
             bad_folder_paths.append(target_path)
         else:
diff --git a/src/haddock/libs/libplots.py b/src/haddock/libs/libplots.py
index b85f2bae7..18e3bcd46 100644
--- a/src/haddock/libs/libplots.py
+++ b/src/haddock/libs/libplots.py
@@ -884,7 +884,7 @@ def clt_table_handler(
         topX_clusters: int = 10,
         clustered_topX: int = 4,
         unclustered_topX: int = 10,
-        top_ranked_mapping: Optional[dict[Path, str]] = None,
+        top_ranked_mapping: Optional[dict[Path, Path]] = None,
         ) -> pd.DataFrame:
     """
     Create a dataframe including data for tables.
@@ -914,29 +914,30 @@ def clt_table_handler(
     clusters_df = clusters_df.round(2)
     structs_df = structs_df.round(2)
 
+    # if the run will be cleaned, the structures are going to be gzipped
     if not top_ranked_mapping:
-        # if the run will be cleaned, the structures are going to be gzipped
-        if is_cleaned and not top_ranked_mapping:
+        if is_cleaned:
             # substitute the values in the df by adding .gz at the end
             structs_df['model'] = structs_df['model'].replace(
                 to_replace=r"(\.pdb)$", value=r".pdb.gz", regex=True,
-                )
-
-    # ss_file is in NN_caprieval/ while report is in
-    # analysis/NN_caprieval_analysis/
-    # need to correct model paths by prepending ../
-    def correct_relative_paths(
-            path: str,
-            top_ranked_mapping: Optional[dict[Path, str]],
-            ) -> str:
-        try:
-            new_path = top_ranked_mapping[Path(path)]
-        except KeyError:
-            new_path = f"../{path}"
-        return new_path
-    structs_df['model'] = structs_df['model'].apply(
-        lambda x: correct_relative_paths(x, top_ranked_mapping)
-        )
+            )
+    else:
+        # ss_file is in NN_caprieval/ while report is in
+        # analysis/NN_caprieval_analysis/
+        # need to correct model paths by prepending ../
+        def correct_relative_paths(
+                path: str,
+                top_ranked_mapping: Optional[dict[Path, Path]],
+                ) -> str:
+            try:
+                new_path = top_ranked_mapping[path]
+            except (KeyError, TypeError, ):
+                new_path = f"../{path}"
+            return new_path
+        
+        structs_df['model'] = structs_df['model'].apply(
+            lambda x: correct_relative_paths(x, top_ranked_mapping)
+            )
 
     is_unclustered = clusters_df["cluster_rank"].unique().tolist() == ["-"]
     # If unclustered, we only want to show the top 10 structures in a table.

From 3d1920c1806725ddd0a9ca271911d8605203cf08 Mon Sep 17 00:00:00 2001
From: Marco Giulini <54807167+mgiulini@users.noreply.github.com>
Date: Thu, 30 Jan 2025 17:33:37 +0100
Subject: [PATCH 8/8] clarify --self-contained help text in cli_analyse.py

Co-authored-by: Victor Reys <132575181+VGPReys@users.noreply.github.com>
---
 src/haddock/clis/cli_analyse.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/haddock/clis/cli_analyse.py b/src/haddock/clis/cli_analyse.py
index 1b529d88d..c306a30d7 100644
--- a/src/haddock/clis/cli_analyse.py
+++ b/src/haddock/clis/cli_analyse.py
@@ -155,7 +155,7 @@
 
 ap.add_argument(
     "--self-contained",
-    help="Should the models be accessed locally?",
+    help="If self-contained is set, models will be copied locally in the analysis directory, allowing to visualize structures outside of the haddock3 run.",
     required=False,
     default=False,
     type=bool,