From 5fc7039dfe34e646a16906208cd27b296d0ed332 Mon Sep 17 00:00:00 2001 From: VGPReys Date: Wed, 30 Oct 2024 15:12:44 +0100 Subject: [PATCH 1/8] removing mpi.pkl files at cleaning steps --- src/haddock/gear/clean_steps.py | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/src/haddock/gear/clean_steps.py b/src/haddock/gear/clean_steps.py index a73915d6f..5448c8abf 100644 --- a/src/haddock/gear/clean_steps.py +++ b/src/haddock/gear/clean_steps.py @@ -59,16 +59,25 @@ def clean_output(path: FilePath, ncores: int = 1) -> None: # `unpack_compressed_and_archived_files` so that the # uncompressing routines when restarting the run work. + # Files to delete (all) + file_to_delete_all = ( + "mpi.pkl", + ) + for extension in file_to_delete_all: + for file_ in glob_folder(path, extension): + Path(file_).unlink() + # Files to delete # deletes all except the first one - files_to_delete = [ + # (keeping one for debugging purposes) + files_to_delete = ( ".inp", ".inp.gz", ".out", ".out.gz", ".job", ".err", - ] + ) for extension in files_to_delete: flist = glob_folder(path, extension) @@ -76,11 +85,11 @@ def clean_output(path: FilePath, ncores: int = 1) -> None: Path(file_).unlink() # files to archive (all files in single .gz) - files_to_archive = [ + files_to_archive = ( ".seed", ".seed.gz", ".con", - ] + ) archive_ready = partial(_archive_and_remove_files, path=path) _ncores = min(ncores, len(files_to_archive)) @@ -90,13 +99,13 @@ def clean_output(path: FilePath, ncores: int = 1) -> None: pass # files to compress in .gz - files_to_compress = [ + files_to_compress = ( ".inp", ".out", ".pdb", ".psf", ".cnserr", - ] + ) for ftc in files_to_compress: found = compress_files_ext(path, ftc, ncores=ncores) @@ -111,9 +120,11 @@ def _archive_and_remove_files(fta: str, path: FilePath) -> None: # eventually this function can be moved to `libs.libio` in case of future need. -def unpack_compressed_and_archived_files(folders: Iterable[FilePathT], - ncores: int = 1, - dec_all: bool = False) -> None: +def unpack_compressed_and_archived_files( + folders: Iterable[FilePathT], + ncores: int = 1, + dec_all: bool = False, + ) -> None: """ Unpack compressed and archived files in a folders. From 2c06f17e3ff33a70bbdea3e7a6f5788805370919 Mon Sep 17 00:00:00 2001 From: VGPReys Date: Wed, 30 Oct 2024 15:19:43 +0100 Subject: [PATCH 2/8] no more bioexcel survey --- src/haddock/gear/greetings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/haddock/gear/greetings.py b/src/haddock/gear/greetings.py index 3e92691c7..353456e96 100644 --- a/src/haddock/gear/greetings.py +++ b/src/haddock/gear/greetings.py @@ -31,7 +31,7 @@ feedback_urls = { "GitHub issues": "https://github.com/haddocking/haddock3/issues", "BioExcel feedback": "https://www.bonvinlab.org/feedback", - "BioExcel survey": "https://bioexcel.eu/bioexcel-survey-2024/", + # "BioExcel survey": "https://bioexcel.eu/bioexcel-survey-2024/", } From c1a429b7b9bcd018a2304411a6770f8cc7af746f Mon Sep 17 00:00:00 2001 From: VGPReys Date: Thu, 31 Oct 2024 16:00:27 +0100 Subject: [PATCH 3/8] local structures in analysis & archiving haddock3 run --- src/haddock/clis/cli.py | 20 +- src/haddock/clis/cli_analyse.py | 367 +++++++++++++++++++---------- src/haddock/core/optional.yaml | 10 +- src/haddock/gear/postprocessing.py | 46 ++++ src/haddock/libs/libplots.py | 43 +++- src/haddock/libs/libworkflow.py | 19 +- 6 files changed, 357 insertions(+), 148 deletions(-) create mode 100644 src/haddock/gear/postprocessing.py diff --git a/src/haddock/clis/cli.py b/src/haddock/clis/cli.py index a3681430c..65e4b78e7 100755 --- a/src/haddock/clis/cli.py +++ b/src/haddock/clis/cli.py @@ -75,7 +75,7 @@ def cli(ap: ArgumentParser, main: Callable[..., None]) -> None: def maincli() -> None: """Execute main client.""" - cli(ap, main) + cli(_ap(), main) def main( @@ -117,6 +117,7 @@ def main( get_initial_greeting, gen_feedback_messages, ) + from haddock.gear.postprocessing import archive_run from haddock.gear.prepare_run import setup_run from haddock.libs.libio import working_directory from haddock.libs.liblog import ( @@ -181,10 +182,7 @@ def main( restart_step = restart WorkflowManager_ = WorkflowManager - with ( - working_directory(_run_dir), - log_error_and_exit(), - ): + with (working_directory(_run_dir), log_error_and_exit()): workflow = WorkflowManager_( workflow_params=params, start=restart_step, @@ -193,10 +191,20 @@ def main( # Main loop of execution workflow.run() + + # Run post-processing steps if other_params["postprocess"]: - workflow.postprocess() + workflow.postprocess(self_contained=other_params["gen_archive"]) + # Clean outputs workflow.clean() + # Generate archive of the run + if other_params["gen_archive"]: + _run_archive, _analysis_archive = archive_run(_run_dir) + log.info(f"Run archive created: {_run_archive}!") + if _analysis_archive: + log.info(f"Run analysis archive created: {_analysis_archive}") + # Finish end = time() elapsed = convert_seconds_to_min_sec(end - start) diff --git a/src/haddock/clis/cli_analyse.py b/src/haddock/clis/cli_analyse.py index 65ca24507..f0ef7e519 100644 --- a/src/haddock/clis/cli_analyse.py +++ b/src/haddock/clis/cli_analyse.py @@ -20,6 +20,7 @@ Where, ``-m 1 3`` means that the analysis will be performed on ``1_rigidbody`` and ``3_flexref``. """ + import argparse import os import shutil @@ -66,59 +67,6 @@ INTER_STR = INTERACTIVE_RE_SUFFIX # suffix of interactive analysis folders -def get_cluster_ranking( - capri_clt_filename: FilePath, - top_cluster: int, - ) -> ClRank: - """ - Get capri cluster ranking. - - Parameters - ---------- - capri_clt_filename : str or Path - capri cluster filename - top_cluster : int - Number of clusters to be considered - - Returns - ------- - cl_ranking : dict - {cluster_id : cluster_rank} dictionary - """ - cl_ranking: ClRank = {} - dfcl = read_capri_table(capri_clt_filename) - for n in range(min(top_cluster, dfcl.shape[0])): - cl_ranking[dfcl["cluster_id"].iloc[n]] = dfcl["caprieval_rank"].iloc[n] - return cl_ranking - - -def update_paths( - capri_ss_filename: FilePath, toch: str = "../", toadd: str = "../../" -) -> None: - """ - Update paths in capri_ss_filename. - - Parameters - ---------- - capri_ss_filename : str or Path - capri ss filename - toch : str - string to be replaced - toadd : str - string to be added - """ - new_lines: list[str] = [] - with open(capri_ss_filename, "r") as rfile: - for ln in rfile: - new_ln = ln.replace(toch, toadd) - new_lines.append(new_ln) - - with open(capri_ss_filename, "w") as wfile: - for ln in new_lines: - wfile.write(ln) - return - - # Command line interface parser ap = argparse.ArgumentParser( prog="haddock3-analyse", @@ -205,6 +153,14 @@ def update_paths( default=1, ) +ap.add_argument( + "--self-contained", + help="Should the models be accessed locally?", + required=False, + default=False, + type=bool, +) + ap.add_argument( "-p", @@ -242,6 +198,59 @@ def maincli() -> None: cli(_ap(), main) +def get_cluster_ranking( + capri_clt_filename: FilePath, + top_cluster: int, + ) -> ClRank: + """ + Get capri cluster ranking. + + Parameters + ---------- + capri_clt_filename : str or Path + capri cluster filename + top_cluster : int + Number of clusters to be considered + + Returns + ------- + cl_ranking : dict + {cluster_id : cluster_rank} dictionary + """ + cl_ranking: ClRank = {} + dfcl = read_capri_table(capri_clt_filename) + for n in range(min(top_cluster, dfcl.shape[0])): + cl_ranking[dfcl["cluster_id"].iloc[n]] = dfcl["caprieval_rank"].iloc[n] + return cl_ranking + + +def update_paths( + capri_ss_filename: FilePath, toch: str = "../", toadd: str = "../../" +) -> None: + """ + Update paths in capri_ss_filename. + + Parameters + ---------- + capri_ss_filename : str or Path + capri ss filename + toch : str + string to be replaced + toadd : str + string to be added + """ + new_lines: list[str] = [] + with open(capri_ss_filename, "r") as rfile: + for ln in rfile: + new_ln = ln.replace(toch, toadd) + new_lines.append(new_ln) + + with open(capri_ss_filename, "w") as wfile: + for ln in new_lines: + wfile.write(ln) + return + + def run_capri_analysis( step: str, run_dir: FilePath, @@ -356,67 +365,128 @@ def update_paths_in_capri_dict( return new_capri_dict -def zip_top_ranked( +def get_top_ranked_mapping( capri_filename: FilePath, cluster_ranking: ClRank, - summary_name: FilePath, - ) -> None: - """ - Zip the top ranked structures. + clustered_topX: int = 4, + unclustered_topX: int = 10, + ) -> dict[Path, str]: + # Set mapping of generated files + top_ranked_mapping: dict[Path, str] = {} - Parameters - ---------- - cluster_ranking : dict - {cluster_id : cluster_rank} dictionary - ss_file : str or Path - capri ss filename - - Returns - ------- - output_zipfile : str or Path - path to the zipped file - """ + # Read table capri_df = read_capri_table(capri_filename, comment="#") + # Group by clusters gb_cluster = capri_df.groupby("cluster_id") + + # Loop over clusters for cl_id, cl_df in gb_cluster: + # Filter only top clusters if cl_id in cluster_ranking.keys(): + # If clustered structure if cl_id != "-": - structs = cl_df.loc[cl_df["model-cluster_ranking"] <= 4][["model", "model-cluster_ranking"]] + # Retrieve only top 4 models per cluster + structs = cl_df.loc[cl_df["model-cluster_ranking"] <= clustered_topX][["model", "model-cluster_ranking"]] # noqa : E501 + # If un-clustered structures else: - structs = cl_df.loc[cl_df["caprieval_rank"] <= 10][["model", "caprieval_rank"]] + # Retrieve top 10 + structs = cl_df.loc[cl_df["caprieval_rank"] <= unclustered_topX][["model", "caprieval_rank"]] # noqa : E501 + # Rename columns to access them using same keywords structs.columns = ["model", "rank"] # iterate over the structures for _, row in structs.iterrows(): - struct = Path(row["model"]) - struct_gz = Path(f"{struct}.gz") + # Point rank rank = row["rank"] # set target name if cl_id != "-": - target_name = f"cluster_{cluster_ranking[cl_id]}_model_{rank}.pdb" + # Give it its cluster name + target_name = ( + f"cluster_{cluster_ranking[cl_id]}" + f"_model_{rank}.pdb" + ) else: + # Give it its rank name target_name = f"model_{rank}.pdb" + + # Generate structure path + struct = Path(row["model"]) + struct_gz = Path(f"{struct}.gz") # copy the structure if Path(struct).exists(): - shutil.copy(struct, Path(target_name)) + top_ranked_mapping[struct] = target_name elif struct_gz.exists(): - shutil.copy(struct_gz, ".") - # unpack the file - _unpack_gz(Path(".", struct_gz.name)) - shutil.move(struct.name, Path(target_name)) + top_ranked_mapping[struct_gz] = target_name else: log.warning(f"structure {struct} not found") + return top_ranked_mapping - # now make the archive and delete the pdb files - archive_files_ext(".", "pdb") - for file in Path(".").glob("*.pdb"): - file.unlink() - # move archive to summary - expected_archive = Path(".", "pdb.tgz") - if expected_archive.exists(): - shutil.move("pdb.tgz", summary_name) - log.info(f"Summary archive {summary_name} created!") +def zip_top_ranked( + top_ranked_mapping: dict[Path, str], + summary_name: str, + gen_archive: bool, + ) -> Optional[Path]: + """ + Zip the top ranked structures. + + Parameters + ---------- + capri_filename : str or Path + capri ss filename + cluster_ranking : dict + {cluster_id : cluster_rank} dictionary + summary_name: str + Base name of the archive to be generated + gen_archive: bool + Should the archive be generated? + clustered_topX: int + Number of models to access per cluster. Default is 4. + unclustered_topX: int + Number of models to access when no clusters. Default is 10. + + Return + ------ + output_fname : Optional[Path] + Path to the generated output. Can be a .tgz archive or a directory. + """ + for ori_fpath, new_name in top_ranked_mapping.items(): + # If already compressed + if ori_fpath.suffix == ".gz": + copied_fpath = shutil.copy(ori_fpath, ".") + # unpack the file + _unpack_gz(copied_fpath.name) + # Rename it + shutil.move(copied_fpath.name.replace(".gz", ""), new_name) + else: + shutil.copy(ori_fpath, new_name) + + # Compress pdb files + if gen_archive: + archive_was_created = archive_files_ext(".", "pdb") + # Delete the pdb files + for file_ in top_ranked_mapping.values(): + file_.unlink() + output_fname = Path(f"{summary_name}.tgz") + if archive_was_created: + # move archive to summary + shutil.move("pdb.tgz", output_fname) + log.info(f"Top structures summary archive {output_fname} created!") + return + else: + log.warning(f"Summary archive {output_fname} not created!") + return None + # Generate a directory holding all the structures else: - log.warning(f"Summary archive {summary_name} not created!") + output_fname = Path(summary_name) + output_fname.mkdir(parents=True, exist_ok=True) + for ori_fpath, new_name in top_ranked_mapping.items(): + # Create new path + next_filepath = Path(output_fname, str(new_name)) + # Hold it in mapping dict + top_ranked_mapping[ori_fpath] = str(next_filepath) + # Displace file + shutil.move(new_name, top_ranked_mapping[ori_fpath]) + log.info(f"Top structures copied into {output_fname}!") + return output_fname def analyse_step( @@ -431,6 +501,9 @@ def analyse_step( offline: bool = False, mode: str = "local", ncores: int = 4, + self_contained: bool = False, + clustered_topX: int = 4, + unclustered_topX: int = 10, ) -> None: """ Analyse a step. @@ -454,13 +527,31 @@ def analyse_step( Produce images in the selected format. scale : int scale for images. + is_cleaned: bool + is the directory going to be cleaned? + offline: bool + Should plots js functions be self-contained? + mode: str + mode of execution + ncores: int + number of cores to use + self_contained : bool + Should the analysis directory contain the models? + clustered_topX: int + Number of models to access per cluster. Default is 4. + unclustered_topX: int + Number of models to access when no clusters. Default is 10. """ log.info(f"Analysing step {step}") - + # Create directory target_path.mkdir(parents=True, exist_ok=False) + # Build caprieval output file names/paths + ss_filename = Path("capri_ss.tsv") + clt_filename = Path("capri_clt.tsv") step_name = step.split("_")[1] - ss_fname = Path(run_dir, f"{step}/capri_ss.tsv") - clt_fname = Path(run_dir, f"{step}/capri_clt.tsv") + ss_fname = Path(run_dir, f"{step}/{ss_filename}") + clt_fname = Path(run_dir, f"{step}/{clt_filename}") + # Search for caprieval output files if step_name != "caprieval": if ss_fname.exists() and clt_fname.exists(): log.info(f"step {step} has caprieval data, files are available") @@ -472,44 +563,62 @@ def analyse_step( log.info(f"step {step} is caprieval, files should be already available") run_capri = False - if run_capri == False: + # If caprieval data available, just copy them + if not run_capri: shutil.copy(ss_fname, target_path) shutil.copy(clt_fname, target_path) # Go to directory where to write all the analysis figures / report os.chdir(target_path) # if the step is not caprieval, caprieval must be run - if run_capri == True: + if run_capri: run_capri_analysis(step, run_dir, capri_dict, is_cleaned, mode, ncores) log.info("CAPRI files identified") # plotting - ss_file = Path("capri_ss.tsv") - clt_file = Path("capri_clt.tsv") - if clt_file.exists(): - cluster_ranking = get_cluster_ranking(clt_file, top_cluster) + if clt_filename.exists(): + cluster_ranking = get_cluster_ranking(clt_filename, top_cluster) else: - raise Exception(f"clustering file {clt_file} does not exist") - if ss_file.exists(): + raise Exception(f"clustering file {clt_filename} does not exist") + if ss_filename.exists(): + # Generate file mapping for top ranked structures + top_ranked_mapping = get_top_ranked_mapping( + ss_filename, + cluster_ranking, + clustered_topX=clustered_topX, + unclustered_topX=unclustered_topX, + ) + # provide a zipped archive of the top ranked structures + zip_top_ranked( + top_ranked_mapping, + "summary", + not self_contained, + ) log.info("Plotting results..") scatters = scatter_plot_handler( - ss_file, + ss_filename, cluster_ranking, format, scale, offline=offline, ) boxes = box_plot_handler( - ss_file, + ss_filename, cluster_ranking, format, scale, offline=offline, ) - tables = clt_table_handler(clt_file, ss_file, is_cleaned) + tables = clt_table_handler( + clt_filename, + ss_filename, + is_cleaned, + topX_clusters=top_cluster, + clustered_topX=clustered_topX, + unclustered_topX=unclustered_topX, + top_ranked_mapping=top_ranked_mapping if self_contained else None, + ) report_generator(boxes, scatters, tables, step, ".", offline) - # provide a zipped archive of the top ranked structures - zip_top_ranked(ss_file, cluster_ranking, Path("summary.tgz")) def validate_format(_format: Optional[ImgFormat]) -> Optional[ImgFormat]: @@ -570,6 +679,7 @@ def main( offline: bool = False, mode: Optional[str] = None, ncores: Optional[int] = None, + self_contained: bool = False, **kwargs: Any, ) -> None: """ @@ -579,33 +689,26 @@ def main( ---------- run_dir : str or Path Path to the original run directory. - modules : list of ints List of the integer prefix of the modules to copy. - top_cluster : int Number of clusters to be considered. - format : str Produce images in the selected format. - scale : int scale for images. - inter: bool analyse only steps labelled as 'interactive' - is_cleaned: bool is the directory going to be cleaned? - offline: bool Should plots js functions be self-contained? - mode: str mode of execution - ncores: int number of cores to use + self_contained : bool + Should the analysis directory contain the models? """ log.level = 20 log.info( @@ -646,7 +749,7 @@ def main( bad_folder_paths: list[Path] = [] for step in sel_steps: subfolder_name = f"{step}_analysis" - target_path = Path(Path("./"), subfolder_name) + target_path = Path(subfolder_name) # check if subfolder is already present dest_path = Path(ANA_FOLDER, subfolder_name) @@ -662,7 +765,6 @@ def main( shutil.rmtree(dest_path) # run the analysis - error = False try: analyse_step( step, @@ -676,14 +778,14 @@ def main( offline=offline, mode=mode, ncores=ncores, - ) + #self_contained=self_contained, + self_contained=True, + ) except Exception as e: - error = True log.warning( f"Could not execute the analysis for step {step}. " f"The following error occurred {e}" ) - if error: bad_folder_paths.append(target_path) else: good_folder_paths.append(target_path) @@ -694,8 +796,23 @@ def main( # moving files into analysis folder if good_folder_paths != []: log.info("moving files to analysis folder") + urls: list[str] = [] for directory in good_folder_paths: shutil.move(directory, outdir) + url = f"- [{Path(directory, 'report.html')}](http://0.0.0.0:8000/{Path(directory, 'report.html')}) " # noqa : E501 + urls.append(url) + + # Adding instructions on how to setup the server + readme_fpath = Path(outdir, "README.md") + readme_fpath.write_text( + f"# Usage{os.linesep}{os.linesep}" + "To view structures or download the structure files, " + f"in a terminal run the command:{os.linesep}```bash{os.linesep}" + f"python -m http.server --directory .{os.linesep}```{os.linesep}" + f"And open the link following links in a web browser:{os.linesep}" + f"{os.linesep.join(urls)}{os.linesep}" + ) + assert readme_fpath.exists() if bad_folder_paths != []: log.info("cancelling unsuccesful analysis folders") @@ -714,12 +831,12 @@ def main( log.info(f"View the results in {report_file}") info_msg = ( "To view structures or download the structure files, " - f"in a terminal run the command " - f"`python -m http.server --directory {rundir_cwd}`. " - "By default, http server runs on `http://0.0.0.0:8000/`. " - f"Open the link http://0.0.0.0:8000/{report_file} " + f"in a terminal run the command: {os.linesep}" + f">python -m http.server --directory {rundir_cwd}{os.linesep}" + # "By default, http server runs on `http://0.0.0.0:8000/`. " + f"And open the link http://0.0.0.0:8000/{report_file} " "in a web browser." - ) + ) log.info(info_msg) os.chdir(ori_cwd) return diff --git a/src/haddock/core/optional.yaml b/src/haddock/core/optional.yaml index 3e23e7f33..a07a3d9c1 100644 --- a/src/haddock/core/optional.yaml +++ b/src/haddock/core/optional.yaml @@ -21,4 +21,12 @@ postprocess: used to plot the results of a HADDOCK3 workflow. If this option, this command is automatically executed at the end of the workflow (on the caprieval folders). explevel: easy - +gen_archive: + default: false + type: boolean + title: Generates an archive of the run and of the analysis. + short: If true, executes haddock3-analyse in self_contained mode and generates + archives of the run_directory and of the analysis in two separated tgz files. + long: If true, executes haddock3-analyse in self_contained mode and generates + archives of the run_directory and of the analysis in two separated tgz files. + explevel: easy diff --git a/src/haddock/gear/postprocessing.py b/src/haddock/gear/postprocessing.py new file mode 100644 index 000000000..349dcbb6b --- /dev/null +++ b/src/haddock/gear/postprocessing.py @@ -0,0 +1,46 @@ +"""Tools for post-processing haddock3 runs.""" + +import os +import shutil +import tarfile + +from haddock import log +from haddock.clis.cli_analyse import ANA_FOLDER +from haddock.core.typing import Optional + + +def archive_run(run_dir: str, delete: bool = True) -> tuple[str, Optional[str]]: + """Create an archive of the haddock3 run directory and analysis. + + Parameters + ---------- + run_dir : str + Path to the run directory + delete : bool, optional + Should the un-archived directory be deleted?, by default False + + Returns + ------- + tuple[str, Optional[str]] + run_archive_fname : str + Path to the run archive + analysis_archive_fname : Optional[str] + Path to the run analysis archive + """ + log.info("Creating an archive of the run") + # Start by archiving the run_directory + run_archive_fname = f"{run_dir}.tgz" + with tarfile.open(run_archive_fname, "w:gz") as tar: + tar.add(run_dir, arcname=os.path.basename(run_dir)) + + # Archive the analysis directory + analysis_archive_fname = None + if os.path.exists(f"{run_dir}/{ANA_FOLDER}"): + analysis_archive_fname = f"{run_dir}_{ANA_FOLDER}.tgz" + with tarfile.open(analysis_archive_fname, "w:gz") as tar: + tar.add(f"{run_dir}/{ANA_FOLDER}", arcname=f"{run_dir}_{ANA_FOLDER}") + + if delete: + shutil.rmtree(run_dir) + + return run_archive_fname, analysis_archive_fname \ No newline at end of file diff --git a/src/haddock/libs/libplots.py b/src/haddock/libs/libplots.py index 0747b2192..b85f2bae7 100644 --- a/src/haddock/libs/libplots.py +++ b/src/haddock/libs/libplots.py @@ -877,7 +877,15 @@ def create_other_cluster( return clusters_df, structs_df -def clt_table_handler(clt_file, ss_file, is_cleaned=False): +def clt_table_handler( + clt_file: FilePath, + ss_file: FilePath, + is_cleaned: bool = False, + topX_clusters: int = 10, + clustered_topX: int = 4, + unclustered_topX: int = 10, + top_ranked_mapping: Optional[dict[Path, str]] = None, + ) -> pd.DataFrame: """ Create a dataframe including data for tables. @@ -906,23 +914,34 @@ def clt_table_handler(clt_file, ss_file, is_cleaned=False): clusters_df = clusters_df.round(2) structs_df = structs_df.round(2) - # if the run will be cleaned, the structures are going to be gzipped - if is_cleaned: - # substitute the values in the df by adding .gz at the end - structs_df['model'] = structs_df['model'].replace( - to_replace=r"(\.pdb)$", value=r".pdb.gz", regex=True, - ) + if not top_ranked_mapping: + # if the run will be cleaned, the structures are going to be gzipped + if is_cleaned and not top_ranked_mapping: + # substitute the values in the df by adding .gz at the end + structs_df['model'] = structs_df['model'].replace( + to_replace=r"(\.pdb)$", value=r".pdb.gz", regex=True, + ) # ss_file is in NN_caprieval/ while report is in # analysis/NN_caprieval_analysis/ # need to correct model paths by prepending ../ - structs_df['model'] = structs_df['model'].apply(lambda x: f"../{x}") + def correct_relative_paths( + path: str, + top_ranked_mapping: Optional[dict[Path, str]], + ) -> str: + try: + new_path = top_ranked_mapping[Path(path)] + except KeyError: + new_path = f"../{path}" + return new_path + structs_df['model'] = structs_df['model'].apply( + lambda x: correct_relative_paths(x, top_ranked_mapping) + ) is_unclustered = clusters_df["cluster_rank"].unique().tolist() == ["-"] # If unclustered, we only want to show the top 10 structures in a table. if is_unclustered: - max_unstructured_structures = 10 - structs_df = structs_df[:max_unstructured_structures] + structs_df = structs_df[:unclustered_topX] cols2keep = ['caprieval_rank', 'model'] + list(AXIS_NAMES.keys()) structs_df = structs_df[cols2keep] # model has ../../01_rigidbody/rigidbody_62.pdb.gz @@ -933,11 +952,11 @@ def clt_table_handler(clt_file, ss_file, is_cleaned=False): clusters_df, structs_df = create_other_cluster( clusters_df, structs_df, - max_clusters=11, + max_clusters=topX_clusters + 1, ) clusters_df = clean_capri_table(clusters_df) - structs_df = find_best_struct(structs_df, max_best_structs=4) + structs_df = find_best_struct(structs_df, max_best_structs=clustered_topX) df_merged = pd.merge(clusters_df, structs_df, on="cluster_id") return df_merged diff --git a/src/haddock/libs/libworkflow.py b/src/haddock/libs/libworkflow.py index ca2c45ccb..e789b79f3 100644 --- a/src/haddock/libs/libworkflow.py +++ b/src/haddock/libs/libworkflow.py @@ -59,7 +59,7 @@ def clean(self, terminated: Optional[int] = None) -> None: for step in self.recipe.steps[:terminated]: step.clean() - def postprocess(self) -> None: + def postprocess(self, self_contained: bool = False) -> None: """Postprocess the workflow.""" # is the workflow going to be cleaned? is_cleaned = self.recipe.steps[0].config['clean'] @@ -69,14 +69,25 @@ def postprocess(self) -> None: mode = self.recipe.steps[0].config['mode'] # ncores ncores = self.recipe.steps[0].config['ncores'] - + capri_steps: list[int] = [] for step in self.recipe.steps: if step.module_name == "caprieval": capri_steps.append(step.order) # type: ignore # call cli_analyse (no need for capri_dicts, it's all precalculated) - cli_analyse("./", capri_steps, top_cluster=10, format=None, scale=None, - inter=False, is_cleaned=is_cleaned, offline=offline, mode=mode, ncores=ncores) + cli_analyse( + "./", + capri_steps, + top_cluster=10, + format=None, + scale=None, + inter=False, + is_cleaned=is_cleaned, + offline=offline, + mode=mode, + ncores=ncores, + self_contained=self_contained, + ) # call cli_traceback. If it fails, it's not a big deal try: cli_traceback("./", offline=offline) From 3ea287d3a1ed9cf03ce5e28ca64eff9fc06a5a67 Mon Sep 17 00:00:00 2001 From: VGPReys Date: Fri, 1 Nov 2024 16:23:12 +0100 Subject: [PATCH 4/8] update test --- src/haddock/clis/cli_analyse.py | 2 +- tests/test_cli_analyse.py | 26 ++++++++++++++++++++------ 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/src/haddock/clis/cli_analyse.py b/src/haddock/clis/cli_analyse.py index f0ef7e519..9baee2ca5 100644 --- a/src/haddock/clis/cli_analyse.py +++ b/src/haddock/clis/cli_analyse.py @@ -464,7 +464,7 @@ def zip_top_ranked( archive_was_created = archive_files_ext(".", "pdb") # Delete the pdb files for file_ in top_ranked_mapping.values(): - file_.unlink() + Path(file_).unlink() output_fname = Path(f"{summary_name}.tgz") if archive_was_created: # move archive to summary diff --git a/tests/test_cli_analyse.py b/tests/test_cli_analyse.py index bf5d4325c..224ec3d9f 100644 --- a/tests/test_cli_analyse.py +++ b/tests/test_cli_analyse.py @@ -8,6 +8,7 @@ from haddock.clis.cli_analyse import ( get_cluster_ranking, + get_top_ranked_mapping, main, update_capri_dict, zip_top_ranked, @@ -51,11 +52,13 @@ def test_update_capri_dict(default_capri): def test_get_cluster_ranking(example_capri_clt): """Test get_cluster_ranking.""" obs_cl_ranking = get_cluster_ranking(example_capri_clt, 5) - exp_cl_ranking = {16: 1, - 1: 2, - 13: 3, - 4: 4, - 5: 5} + exp_cl_ranking = { + 16: 1, + 1: 2, + 13: 3, + 4: 4, + 5: 5, + } assert exp_cl_ranking == obs_cl_ranking @@ -109,9 +112,20 @@ def test_zip_top_ranked(example_capri_ss, monkeypatch): monkeypatch.chdir(rigid_dir_analysis) exp_cl_ranking = {1: 2} - zip_top_ranked(example_capri_ss, exp_cl_ranking, "summary.tgz") + top_ranked_mapping = get_top_ranked_mapping( + example_capri_ss, + exp_cl_ranking, + ) + # Archive version + zip_top_ranked(top_ranked_mapping, "summary", True) assert os.path.isfile("summary.tgz") is True + # Non-Archived version + zip_top_ranked(top_ranked_mapping, "notarchived", False) + assert not os.path.isfile("notarchived.tgz") + assert os.path.isdir("notarchived") + assert len(list(Path("notarchived").glob("*.pdb"))) > 0 + def test_main_offline(example_capri_ss, example_capri_clt, tmp_path): """Test cli_analyse main in offline mode.""" From 5570d4414816567cde9fd83c1fc8f50b230a624c Mon Sep 17 00:00:00 2001 From: Victor Reys <132575181+VGPReys@users.noreply.github.com> Date: Fri, 6 Dec 2024 09:06:44 +0100 Subject: [PATCH 5/8] Update src/haddock/gear/greetings.py --- src/haddock/gear/greetings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/haddock/gear/greetings.py b/src/haddock/gear/greetings.py index 9565c5bd2..e34869b93 100644 --- a/src/haddock/gear/greetings.py +++ b/src/haddock/gear/greetings.py @@ -30,7 +30,7 @@ # Do not hesitate to update / comment one of these feedback_urls = { "GitHub issues": "https://github.com/haddocking/haddock3/issues", - "BioExcel feedback": "https://www.bonvinlab.org/feedback" + "BioExcel feedback": "https://www.bonvinlab.org/feedback", # "BioExcel survey": "https://bioexcel.eu/bioexcel-survey-2024/", "BioExcel forum": "https://ask.bioexcel.eu/c/haddock/6", } From 03d01431dc438eca0198e310cd4fe462f6650b51 Mon Sep 17 00:00:00 2001 From: VGPReys Date: Mon, 27 Jan 2025 16:51:47 +0100 Subject: [PATCH 6/8] fix deprecation warning on regex using backslash --- src/haddock/modules/topology/topoaa/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/haddock/modules/topology/topoaa/__init__.py b/src/haddock/modules/topology/topoaa/__init__.py index 22373dc26..ad3596074 100644 --- a/src/haddock/modules/topology/topoaa/__init__.py +++ b/src/haddock/modules/topology/topoaa/__init__.py @@ -160,8 +160,8 @@ def get_ensemble_origin(ensemble_f: FilePath) -> dict[int, str]: lines = text.split(os.linesep) REMARK_lines = (line for line in lines if line.startswith("REMARK")) re_origin = re.compile( - "REMARK\s+MODEL\s+(\d+)\s+(FROM|from|From)\s+(([\w_-]+\.?)+)" - ) # noqa : E501 + r"REMARK\s+MODEL\s+(\d+)\s+(FROM|from|From)\s+(([\w_-]+\.?)+)" + ) for line in REMARK_lines: if match := re_origin.search(line): model_num = int(match.group(1).strip()) From 4d680635f230591e2cee324fe0441ff47d166ef2 Mon Sep 17 00:00:00 2001 From: VGPReys Date: Wed, 29 Jan 2025 10:52:00 +0100 Subject: [PATCH 7/8] fix analyse on interactive steps --- integration_tests/test_full_workflow.py | 28 ++++++++++------- src/haddock/clis/cli_analyse.py | 21 ++++++------- src/haddock/libs/libplots.py | 41 +++++++++++++------------ 3 files changed, 48 insertions(+), 42 deletions(-) diff --git a/integration_tests/test_full_workflow.py b/integration_tests/test_full_workflow.py index ad351ba1f..51aeb6278 100644 --- a/integration_tests/test_full_workflow.py +++ b/integration_tests/test_full_workflow.py @@ -2,8 +2,13 @@ from pathlib import Path import os import shutil -from haddock.libs.libworkflow import WorkflowManager + +from haddock.clis.cli import main as cli_main +from haddock.clis.cli_analyse import main as cli_analyse +from haddock.clis.cli_re import maincli from haddock.core.typing import Any +from haddock.libs.libworkflow import WorkflowManager + from integration_tests import GOLDEN_DATA @@ -66,7 +71,7 @@ def test_interactive_analysis_on_workflow(monkeypatch): monkeypatch.chdir(tmpdir) - from haddock.clis.cli import main as cli_main + cli_main( Path("workflow.cfg"), ) @@ -85,7 +90,7 @@ def test_interactive_analysis_on_workflow(monkeypatch): # now running interactive re-clustering clustfcc_dir = f"{run_dir}/2_clustfcc" - from haddock.clis.cli_re import maincli + # faking sys.argv in input to haddock3-re monkeypatch.setattr("sys.argv", ["haddock3-re", "clustfcc", clustfcc_dir, "-f", "0.7"] @@ -106,14 +111,15 @@ def test_interactive_analysis_on_workflow(monkeypatch): assert Path(run_dir, "3_caprieval_interactive/capri_ss.tsv").exists() is True # now analyse the interactive folders - from haddock.clis.cli_analyse import main as cli_analyse - cli_analyse(run_dir, - [2,3], - 10, - format=None, - scale=None, - is_cleaned=True, - inter=True) + cli_analyse( + run_dir, + [2, 3], + 10, + format=None, + scale=None, + is_cleaned=True, + inter=True, + ) exp_clustfcc_dir = Path(run_dir, "analysis", "2_clustfcc_interactive_analysis") exp_caprieval_dir = Path(run_dir, "analysis", "3_caprieval_interactive_analysis") assert os.path.isdir(exp_clustfcc_dir) is True diff --git a/src/haddock/clis/cli_analyse.py b/src/haddock/clis/cli_analyse.py index a2df10255..1b529d88d 100644 --- a/src/haddock/clis/cli_analyse.py +++ b/src/haddock/clis/cli_analyse.py @@ -370,9 +370,9 @@ def get_top_ranked_mapping( cluster_ranking: ClRank, clustered_topX: int = 4, unclustered_topX: int = 10, - ) -> dict[Path, str]: + ) -> dict[Path, Path]: # Set mapping of generated files - top_ranked_mapping: dict[Path, str] = {} + top_ranked_mapping: dict[Path, Path] = {} # Read table capri_df = read_capri_table(capri_filename, comment="#") @@ -413,15 +413,15 @@ def get_top_ranked_mapping( struct_gz = Path(f"{struct}.gz") # copy the structure if Path(struct).exists(): - top_ranked_mapping[struct] = target_name + top_ranked_mapping[struct] = Path(target_name) elif struct_gz.exists(): - top_ranked_mapping[struct_gz] = target_name + top_ranked_mapping[struct_gz] = Path(target_name) else: log.warning(f"structure {struct} not found") return top_ranked_mapping def zip_top_ranked( - top_ranked_mapping: dict[Path, str], + top_ranked_mapping: dict[Path, Path], summary_name: str, gen_archive: bool, ) -> Optional[Path]: @@ -451,9 +451,9 @@ def zip_top_ranked( for ori_fpath, new_name in top_ranked_mapping.items(): # If already compressed if ori_fpath.suffix == ".gz": - copied_fpath = shutil.copy(ori_fpath, ".") + copied_fpath = Path(shutil.copy(ori_fpath, ".")) # unpack the file - _unpack_gz(copied_fpath.name) + _unpack_gz(copied_fpath) # Rename it shutil.move(copied_fpath.name.replace(".gz", ""), new_name) else: @@ -482,7 +482,7 @@ def zip_top_ranked( # Create new path next_filepath = Path(output_fname, str(new_name)) # Hold it in mapping dict - top_ranked_mapping[ori_fpath] = str(next_filepath) + top_ranked_mapping[ori_fpath] = Path(next_filepath) # Displace file shutil.move(new_name, top_ranked_mapping[ori_fpath]) log.info(f"Top structures copied into {output_fname}!") @@ -778,13 +778,12 @@ def main( offline=offline, mode=mode, ncores=ncores, - #self_contained=self_contained, - self_contained=True, + self_contained=self_contained, ) except Exception as e: log.warning( f"Could not execute the analysis for step {step}. " - f"The following error occurred {e}" + f"The following error occurred: {e}" ) bad_folder_paths.append(target_path) else: diff --git a/src/haddock/libs/libplots.py b/src/haddock/libs/libplots.py index b85f2bae7..18e3bcd46 100644 --- a/src/haddock/libs/libplots.py +++ b/src/haddock/libs/libplots.py @@ -884,7 +884,7 @@ def clt_table_handler( topX_clusters: int = 10, clustered_topX: int = 4, unclustered_topX: int = 10, - top_ranked_mapping: Optional[dict[Path, str]] = None, + top_ranked_mapping: Optional[dict[Path, Path]] = None, ) -> pd.DataFrame: """ Create a dataframe including data for tables. @@ -914,29 +914,30 @@ def clt_table_handler( clusters_df = clusters_df.round(2) structs_df = structs_df.round(2) + # if the run will be cleaned, the structures are going to be gzipped if not top_ranked_mapping: - # if the run will be cleaned, the structures are going to be gzipped - if is_cleaned and not top_ranked_mapping: + if is_cleaned: # substitute the values in the df by adding .gz at the end structs_df['model'] = structs_df['model'].replace( to_replace=r"(\.pdb)$", value=r".pdb.gz", regex=True, - ) - - # ss_file is in NN_caprieval/ while report is in - # analysis/NN_caprieval_analysis/ - # need to correct model paths by prepending ../ - def correct_relative_paths( - path: str, - top_ranked_mapping: Optional[dict[Path, str]], - ) -> str: - try: - new_path = top_ranked_mapping[Path(path)] - except KeyError: - new_path = f"../{path}" - return new_path - structs_df['model'] = structs_df['model'].apply( - lambda x: correct_relative_paths(x, top_ranked_mapping) - ) + ) + else: + # ss_file is in NN_caprieval/ while report is in + # analysis/NN_caprieval_analysis/ + # need to correct model paths by prepending ../ + def correct_relative_paths( + path: str, + top_ranked_mapping: Optional[dict[Path, Path]], + ) -> str: + try: + new_path = top_ranked_mapping[path] + except (KeyError, TypeError, ): + new_path = f"../{path}" + return new_path + + structs_df['model'] = structs_df['model'].apply( + lambda x: correct_relative_paths(x, top_ranked_mapping) + ) is_unclustered = clusters_df["cluster_rank"].unique().tolist() == ["-"] # If unclustered, we only want to show the top 10 structures in a table. From 3d1920c1806725ddd0a9ca271911d8605203cf08 Mon Sep 17 00:00:00 2001 From: Marco Giulini <54807167+mgiulini@users.noreply.github.com> Date: Thu, 30 Jan 2025 17:33:37 +0100 Subject: [PATCH 8/8] Update src/haddock/clis/cli_analyse.py Co-authored-by: Victor Reys <132575181+VGPReys@users.noreply.github.com> --- src/haddock/clis/cli_analyse.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/haddock/clis/cli_analyse.py b/src/haddock/clis/cli_analyse.py index 1b529d88d..c306a30d7 100644 --- a/src/haddock/clis/cli_analyse.py +++ b/src/haddock/clis/cli_analyse.py @@ -155,7 +155,7 @@ ap.add_argument( "--self-contained", - help="Should the models be accessed locally?", + help="If self-contained is set, models will be copied locally in the analysis directory, allowing to visualize structures outside of the haddock3 run.", required=False, default=False, type=bool,