From 39e9cce778708411e86e3f8362b71a518b308ab0 Mon Sep 17 00:00:00 2001 From: ARMI Date: Mon, 19 Aug 2024 13:02:55 +0200 Subject: [PATCH 01/23] Update visualization_functions.py added type checking and hinting --- .../visualization_functions.py | 58 +++++++++++-------- 1 file changed, 33 insertions(+), 25 deletions(-) diff --git a/openmmdl/openmmdl_analysis/visualization_functions.py b/openmmdl/openmmdl_analysis/visualization_functions.py index f7981118..4b4f8441 100644 --- a/openmmdl/openmmdl_analysis/visualization_functions.py +++ b/openmmdl/openmmdl_analysis/visualization_functions.py @@ -6,30 +6,31 @@ import subprocess import os import shutil +from typing import List, Dict, Any, Optional, Union from openmmdl.openmmdl_analysis.barcode_generation import BarcodeGenerator class TrajectorySaver: - def __init__(self, pdb_md, ligname, special, nucleic): - """Initializes the TrajectorySaver with a mda.Universe object, ligand name, special residue name and if receptor is nucleic. + def __init__(self, pdb_md: mda.Universe, ligname: str, special: str, nucleic: bool) -> None: + """Initializes the TrajectorySaver with an mda.Universe object, ligand name, special residue name, and receptor type. Args: - pdb_md (mda.Universe): MDAnalysis Universe object containing the trajectory - ligname (str): name of the ligand in the pdb file - special (str): name of the special residue/ligand in the pdb file (e.g. HEM) - nucleic (bool): True if receptor is nucleic, False otherwise + pdb_md (mda.Universe): MDAnalysis Universe object containing the trajectory. + ligname (str): Name of the ligand in the pdb file. + special (str): Name of the special residue/ligand in the pdb file (e.g., HEM). + nucleic (bool): True if the receptor is nucleic, False otherwise. """ self.pdb_md = pdb_md self.ligname = ligname self.special = special self.nucleic = nucleic - def save_interacting_waters_trajectory(self, interacting_waters, outputpath): - """Saves .pdb and .dcd files of the trajectory containing ligand, receptor and all interacting waters. + def save_interacting_waters_trajectory(self, interacting_waters: List[int], outputpath: str = './Visualization/') -> None: + """Saves .pdb and .dcd files of the trajectory containing ligand, receptor, and all interacting waters. Args: - interacting_waters (list): list of all interacting water ids - outputpath (str, optional): filepath to output new pdb and dcd files. Defaults to './Visualization/'. + interacting_waters (List[int]): List of all interacting water IDs. + outputpath (str, optional): Filepath to output new pdb and dcd files. Defaults to './Visualization/'. """ water_atoms = self.pdb_md.select_atoms( f"protein or nucleic or resname {self.ligname} or resname {self.special}" @@ -47,13 +48,13 @@ def save_interacting_waters_trajectory(self, interacting_waters, outputpath): for ts in self.pdb_md.trajectory: W.write(water_atoms) - def save_frame(self, frame, outpath, selection=False): + def save_frame(self, frame: int, outpath: str, selection: Optional[str] = None) -> None: """Saves a single frame of the trajectory. Args: - frame (int): Number of the frame to save - outpath (str): Path to save the frame to - selection (str, optional): MDAnalysis selection string. Defaults to False. + frame (int): Number of the frame to save. + outpath (str): Path to save the frame to. + selection (Optional[str], optional): MDAnalysis selection string. Defaults to None. """ self.pdb_md.trajectory[frame] if selection: @@ -64,30 +65,37 @@ def save_frame(self, frame, outpath, selection=False): class Visualizer: - def __init__(self, md, cloud_path, ligname, special): + def __init__(self, md: mda.Universe, cloud_path: str, ligname: str, special: Optional[str] = None) -> None: self.md = md self.cloud = self.load_cloud(cloud_path) self.ligname = ligname self.special = special - def load_cloud(self, cloud_path): + def load_cloud(self, cloud_path: str) -> Dict[str, Dict[str, Union[List[float], List[int]]]]: + """Loads interaction cloud data from a JSON file. + + Args: + cloud_path (str): Path to the cloud data JSON file. + + Returns: + Dict[str, Dict[str, Union[List[float], List[int]]]]: The loaded cloud data. + """ with open(cloud_path, "r") as f: data = json.load(f) return data def visualize( - self, receptor_type="protein or nucleic", height="1000px", width="1000px" - ): + self, receptor_type: str = "protein or nucleic", height: str = "1000px", width: str = "1000px" + ) -> nv.NGLWidget: """Generates visualization of the trajectory with the interacting waters and interaction clouds. Args: - ligname (str): name of the ligand in the pdb file - receptor_type (str, optional): type of receptor. Defaults to 'protein or nucleic'. - height (str, optional): height of the visualization. Defaults to '1000px'. - width (str, optional): width of the visualization. Defaults to '1000px'. + receptor_type (str, optional): Type of receptor. Defaults to 'protein or nucleic'. + height (str, optional): Height of the visualization. Defaults to '1000px'. + width (str, optional): Width of the visualization. Defaults to '1000px'. Returns: - nglview widget: returns an nglview.widget object containing the visualization + nv.NGLWidget: Returns an nglview.widget object containing the visualization. """ sphere_buffers = [] @@ -141,8 +149,8 @@ def visualize( return view -def run_visualization(): - """Runs the visualization notebook in the current directory. The visualization notebook is copied from the package directory to the current directory and automaticaly started.""" +def run_visualization() -> None: + """Runs the visualization notebook in the current directory. The visualization notebook is copied from the package directory to the current directory and automatically started.""" package_dir = os.path.dirname(__file__) notebook_path = os.path.join(package_dir, "visualization.ipynb") current_dir = os.getcwd() From 4b8e11ac5dfe1596c87a725ebb43df0226147a5a Mon Sep 17 00:00:00 2001 From: ARMI Date: Mon, 19 Aug 2024 13:27:07 +0200 Subject: [PATCH 02/23] Update rmsd_calculation.py added type checking and hinting --- .../openmmdl_analysis/rmsd_calculation.py | 73 +++++++++---------- 1 file changed, 35 insertions(+), 38 deletions(-) diff --git a/openmmdl/openmmdl_analysis/rmsd_calculation.py b/openmmdl/openmmdl_analysis/rmsd_calculation.py index 60a650da..cbcc4505 100644 --- a/openmmdl/openmmdl_analysis/rmsd_calculation.py +++ b/openmmdl/openmmdl_analysis/rmsd_calculation.py @@ -7,10 +7,11 @@ import MDAnalysis as mda from MDAnalysis.analysis import rms, diffusionmap from MDAnalysis.analysis.distances import dist +from typing import Optional, List, Tuple, Union @jit(nopython=True, parallel=True, nogil=True) -def calc_rmsd_2frames_jit(ref, frame): +def calc_rmsd_2frames_jit(ref: np.ndarray, frame: np.ndarray) -> float: dist = np.zeros(len(frame)) for atom in range(len(frame)): dist[atom] = ( @@ -23,12 +24,14 @@ def calc_rmsd_2frames_jit(ref, frame): class RMSDAnalyzer: - def __init__(self, prot_lig_top_file, prot_lig_traj_file): - self.prot_lig_top_file = prot_lig_top_file - self.prot_lig_traj_file = prot_lig_traj_file - self.universe = mda.Universe(prot_lig_top_file, prot_lig_traj_file) - - def rmsd_for_atomgroups(self, fig_type, selection1, selection2=None): + def __init__(self, prot_lig_top_file: str, prot_lig_traj_file: str) -> None: + self.prot_lig_top_file: str = prot_lig_top_file + self.prot_lig_traj_file: str = prot_lig_traj_file + self.universe: mda.Universe = mda.Universe(prot_lig_top_file, prot_lig_traj_file) + + def rmsd_for_atomgroups( + self, fig_type: str, selection1: str, selection2: Optional[List[str]] = None + ) -> pd.DataFrame: """Calculate the RMSD for selected atom groups, and save the csv file and plot. Args: @@ -45,7 +48,7 @@ def rmsd_for_atomgroups(self, fig_type, selection1, selection2=None): self.universe, ref, select=selection1, groupselections=selection2 ) rmsd_analysis.run() - columns = [selection1, *selection2] if selection2 else [selection1] + columns = [selection1] + selection2 if selection2 else [selection1] rmsd_df = pd.DataFrame(np.round(rmsd_analysis.rmsd[:, 2:], 2), columns=columns) rmsd_df.index.name = "frame" @@ -54,16 +57,18 @@ def rmsd_for_atomgroups(self, fig_type, selection1, selection2=None): os.makedirs(output_directory, exist_ok=True) # Save the RMSD values to a CSV file in the created directory - rmsd_df.to_csv("./RMSD/RMSD_over_time.csv", sep=" ") + rmsd_df.to_csv(f"{output_directory}/RMSD_over_time.csv", sep=" ") # Plot and save the RMSD over time as a PNG file rmsd_df.plot(title="RMSD of protein and ligand") plt.ylabel("RMSD (Å)") - plt.savefig(f"./RMSD/RMSD_over_time.{fig_type}") + plt.savefig(f"{output_directory}/RMSD_over_time.{fig_type}") return rmsd_df - def rmsd_dist_frames(self, fig_type, lig, nucleic=False): + def rmsd_dist_frames( + self, fig_type: str, lig: str, nucleic: bool = False + ) -> Tuple[np.ndarray, np.ndarray]: """Calculate the RMSD between all frames in a matrix. Args: @@ -76,24 +81,24 @@ def rmsd_dist_frames(self, fig_type, lig, nucleic=False): np.array: pairwise_rmsd_lig. Numpy array of RMSD values for ligand structures. """ if nucleic: - pairwise_rmsd_prot = ( + pairwise_rmsd_prot: np.ndarray = ( diffusionmap.DistanceMatrix(self.universe, select="nucleic") .run() .dist_matrix ) else: - pairwise_rmsd_prot = ( + pairwise_rmsd_prot: np.ndarray = ( diffusionmap.DistanceMatrix(self.universe, select="protein") .run() .dist_matrix ) - pairwise_rmsd_lig = ( + pairwise_rmsd_lig: np.ndarray = ( diffusionmap.DistanceMatrix(self.universe, f"resname {lig}") .run() .dist_matrix ) - max_dist = max(np.amax(pairwise_rmsd_lig), np.amax(pairwise_rmsd_prot)) + max_dist: float = max(np.amax(pairwise_rmsd_lig), np.amax(pairwise_rmsd_prot)) fig, ax = plt.subplots(1, 2) fig.suptitle("RMSD between the frames") @@ -119,14 +124,14 @@ def rmsd_dist_frames(self, fig_type, lig, nucleic=False): plt.savefig(f"./RMSD/RMSD_between_the_frames.{fig_type}") return pairwise_rmsd_prot, pairwise_rmsd_lig - def calc_rmsd_2frames(self, ref, frame): + def calc_rmsd_2frames(self, ref: np.ndarray, frame: np.ndarray) -> float: """ RMSD calculation between a reference and a frame. """ return calc_rmsd_2frames_jit(ref, frame) - def calculate_distance_matrix(self, selection): - distances = np.zeros( + def calculate_distance_matrix(self, selection: str) -> np.ndarray: + distances: np.ndarray = np.zeros( (len(self.universe.trajectory), len(self.universe.trajectory)) ) # calculate distance matrix @@ -135,44 +140,36 @@ def calculate_distance_matrix(self, selection): desc="\033[1mCalculating distance matrix:\033[0m", ): self.universe.trajectory[i] - frame_i = self.universe.select_atoms(selection).positions - # distances[i] = md.rmsd(traj_aligned, traj_aligned, frame=i) + frame_i: np.ndarray = self.universe.select_atoms(selection).positions for j in range(i + 1, len(self.universe.trajectory)): self.universe.trajectory[j] - frame_j = self.universe.select_atoms(selection).positions - rmsd = self.calc_rmsd_2frames(frame_i, frame_j) + frame_j: np.ndarray = self.universe.select_atoms(selection).positions + rmsd: float = self.calc_rmsd_2frames(frame_i, frame_j) distances[i][j] = rmsd distances[j][i] = rmsd return distances - def calculate_representative_frame(self, bmode_frames, DM): - """Calculates the most representative frame for a bindingmode. This is based uppon the averagwe RMSD of a frame to all other frames in the binding mode. + def calculate_representative_frame( + self, bmode_frames: List[int], DM: np.ndarray + ) -> int: + """Calculates the most representative frame for a binding mode. This is based upon the average RMSD of a frame to all other frames in the binding mode. Args: - bmode_frame_list (list): List of frames belonging to a binding mode. + bmode_frames (list): List of frames belonging to a binding mode. DM (np.array): Distance matrix of trajectory. Returns: int: Number of the most representative frame. """ - frames = bmode_frames - mean_rmsd_per_frame = {} - # first loop : first frame + frames: List[int] = bmode_frames + mean_rmsd_per_frame: dict = {} for frame_i in frames: mean_rmsd_per_frame[frame_i] = 0 - # we will add the rmsd between theses 2 frames and then calcul the - # mean for frame_j in frames: - # We don't want to calcul the same frame. - if not frame_j == frame_i: - # we add to the corresponding value in the list of all rmsd - # the RMSD betwween frame_i and frame_j + if frame_j != frame_i: mean_rmsd_per_frame[frame_i] += DM[frame_i - 1, frame_j - 1] - # mean calculation mean_rmsd_per_frame[frame_i] /= len(frames) - # Representative frame = frame with lower RMSD between all other - # frame of the cluster - repre = min(mean_rmsd_per_frame, key=mean_rmsd_per_frame.get) + repre: int = min(mean_rmsd_per_frame, key=mean_rmsd_per_frame.get) return repre From 4193611a02a73bd47bfb92edb8b2ded4eff88e69 Mon Sep 17 00:00:00 2001 From: ARMI Date: Mon, 19 Aug 2024 13:55:28 +0200 Subject: [PATCH 03/23] Update rdkit_figure_generation.py added type checking and type hinting --- .../rdkit_figure_generation.py | 337 +++++++++--------- 1 file changed, 162 insertions(+), 175 deletions(-) diff --git a/openmmdl/openmmdl_analysis/rdkit_figure_generation.py b/openmmdl/openmmdl_analysis/rdkit_figure_generation.py index e435f9a2..d630f30a 100644 --- a/openmmdl/openmmdl_analysis/rdkit_figure_generation.py +++ b/openmmdl/openmmdl_analysis/rdkit_figure_generation.py @@ -12,13 +12,13 @@ class LigandImageGenerator: def __init__( self, - ligand_name, - complex_pdb_file, - ligand_no_h_pdb_file, - smiles_file, - output_svg_filename, - fig_type="svg", - ): + ligand_name: str, + complex_pdb_file: str, + ligand_no_h_pdb_file: str, + smiles_file: str, + output_svg_filename: str, + fig_type: str = "svg", + ) -> None: """ Initialize the LigandImageGenerator class. @@ -30,14 +30,14 @@ def __init__( output_svg_filename (str): Name of the output SVG file. fig_type (str): Type of the output figure. Can be "svg" or "png". """ - self.ligand_name = ligand_name - self.complex_pdb_file = complex_pdb_file - self.ligand_no_h_pdb_file = ligand_no_h_pdb_file - self.smiles_file = smiles_file - self.output_svg_filename = output_svg_filename - self.fig_type = fig_type - - def generate_image(self): + self.ligand_name: str = ligand_name + self.complex_pdb_file: str = complex_pdb_file + self.ligand_no_h_pdb_file: str = ligand_no_h_pdb_file + self.smiles_file: str = smiles_file + self.output_svg_filename: str = output_svg_filename + self.fig_type: str = fig_type + + def generate_image(self) -> None: """Generates an SVG image (or PNG) of the ligand.""" try: # Load complex and ligand structures @@ -47,31 +47,31 @@ def generate_image(self): complex_lig = complex.select_atoms(f"resname {self.ligand_name}") # Load ligand from PDB file - mol = Chem.MolFromPDBFile(self.ligand_no_h_pdb_file) - lig_rd = mol + mol: Chem.Mol = Chem.MolFromPDBFile(self.ligand_no_h_pdb_file) + lig_rd: Chem.Mol = mol # Load reference SMILES with open(self.smiles_file, "r") as file: - reference_smiles = file.read().strip() - reference_mol = Chem.MolFromSmiles(reference_smiles) + reference_smiles: str = file.read().strip() + reference_mol: Chem.Mol = Chem.MolFromSmiles(reference_smiles) # Prepare ligand - prepared_ligand = AllChem.AssignBondOrdersFromTemplate( + prepared_ligand: Chem.Mol = AllChem.AssignBondOrdersFromTemplate( reference_mol, lig_rd ) AllChem.Compute2DCoords(prepared_ligand) # Map atom indices between ligand_no_h and complex for atom in prepared_ligand.GetAtoms(): - atom_index = atom.GetIdx() + atom_index: int = atom.GetIdx() for lig_atom in lig_noh: - lig_index = lig_atom.index + lig_index: int = lig_atom.index if atom_index == lig_index: - lig_atom_name = lig_atom.name + lig_atom_name: str = lig_atom.name for comp_lig in complex_lig: - comp_lig_name = comp_lig.name + comp_lig_name: str = comp_lig.name if lig_atom_name == comp_lig_name: - num = int(comp_lig.id) + num: int = int(comp_lig.id) atom.SetAtomMapNum(num) # Generate an SVG image of the ligand @@ -82,13 +82,13 @@ def generate_image(self): drawer.DrawMolecule(prepared_ligand) # Adjust font size in the SVG output using the FontSize method - font_size = drawer.FontSize() + font_size: float = drawer.FontSize() drawer.SetFontSize( font_size * 0.5 ) # You can adjust the multiplier as needed drawer.FinishDrawing() - svg = drawer.GetDrawingText().replace("svg:", "") + svg: str = drawer.GetDrawingText().replace("svg:", "") # Save the SVG image to the specified output file with open(self.output_svg_filename, "w") as f: @@ -96,7 +96,7 @@ def generate_image(self): # Convert to PNG if requested if self.fig_type == "png": - png_filename = self.output_svg_filename.replace(".svg", ".png") + png_filename: str = self.output_svg_filename.replace(".svg", ".png") cairosvg.svg2png(url=self.output_svg_filename, write_to=png_filename) print(f"PNG image saved as: {png_filename}") @@ -104,8 +104,11 @@ def generate_image(self): print(f"Error: {e}") +import MDAnalysis as mda +from typing import List, Dict, Tuple + class InteractionProcessor: - def __init__(self, complex_pdb_file, ligand_no_h_pdb_file): + def __init__(self, complex_pdb_file: str, ligand_no_h_pdb_file: str): """ Initialize the InteractionProcessor class. @@ -119,14 +122,14 @@ def __init__(self, complex_pdb_file, ligand_no_h_pdb_file): self.ligand_no_h = mda.Universe(ligand_no_h_pdb_file) self.lig_noh = self.ligand_no_h.select_atoms("all") - def split_interaction_data(self, data): + def split_interaction_data(self, data: List[str]) -> List[str]: """Splits the input data into multiple parts. Args: - data (list): A list of ResNr and ResType, Atom indices, interaction type that needs to be split. + data (List[str]): A list of ResNr and ResType, Atom indices, interaction type that needs to be split. Returns: - list: A new list of the interaction data that consists of three parts. + List[str]: A new list of the interaction data that consists of three parts. """ split_data = [] for item in data: @@ -138,28 +141,31 @@ def split_interaction_data(self, data): split_data.append(split_value) return split_data - def highlight_numbers(self, split_data, starting_idx): + def highlight_numbers(self, split_data: List[str], starting_idx: List[int]) -> Tuple[ + List[int], List[int], List[int], List[int], List[int], List[int], List[int], List[int], List[int], List[int], List[int] + ]: """Extracts the data from the split_data output of the interactions and categorizes it to its respective list. Args: - split_data (list): A list of interaction data items, where each item contains information about protein partner name, + split_data (List[str]): A list of interaction data items, where each item contains information about protein partner name, numeric codes and interaction type. - starting_idx (list): Starting index of the ligand atom indices used for identifying the correct atom to highlight. + starting_idx (List[int]): Starting index of the ligand atom indices used for identifying the correct atom to highlight. Returns: - tuple: A tuple that contains list of all of the highlighted atoms of all of the interactions. + Tuple[List[int], List[int], List[int], List[int], List[int], List[int], List[int], List[int], List[int], List[int], List[int]]: + A tuple that contains list of all of the highlighted atoms of all of the interactions. """ - highlighted_hbond_acceptor = [] - highlighted_hbond_donor = [] - highlighted_hydrophobic = [] - highlighted_hbond_both = [] - highlighted_waterbridge = [] - highlighted_pistacking = [] - highlighted_halogen = [] - highlighted_ni = [] - highlighted_pi = [] - highlighted_pication = [] - highlighted_metal = [] + highlighted_hbond_acceptor: List[int] = [] + highlighted_hbond_donor: List[int] = [] + highlighted_hydrophobic: List[int] = [] + highlighted_hbond_both: List[int] = [] + highlighted_waterbridge: List[int] = [] + highlighted_pistacking: List[int] = [] + highlighted_halogen: List[int] = [] + highlighted_ni: List[int] = [] + highlighted_pi: List[int] = [] + highlighted_pication: List[int] = [] + highlighted_metal: List[int] = [] for item in split_data: parts = item.split() @@ -167,119 +173,83 @@ def highlight_numbers(self, split_data, starting_idx): numeric_codes = parts[1:-1] interaction_type = parts[-1] - if interaction_type == "hbond": - parts = item.split() - protein_partner_name = parts[0] - numeric_codes = parts[1:-2] - type = parts[-2] - interaction_type = parts[-1] - for code in numeric_codes: - atom_index = int(code) - complex_id = self.complex.select_atoms(f"id {atom_index}") - for atom in complex_id: - atom_name = atom.name - for lig_atom in self.lig_noh: - if lig_atom.name == atom_name: - lig_real_index = lig_atom.id + for code in numeric_codes: + atom_index = int(code) + complex_id = self.complex.select_atoms(f"id {atom_index}") + atom_name = complex_id[0].name if len(complex_id) > 0 else None + + if atom_name is None: + continue + + lig_atom = self.lig_noh.select_atoms(f"name {atom_name}") + lig_real_index = lig_atom[0].id - 1 if len(lig_atom) > 0 else None + + if lig_real_index is None: + continue + + if interaction_type == "hbond": + type = parts[-2] if type == "Donor": - highlighted_hbond_donor.append(lig_real_index - 1) + highlighted_hbond_donor.append(lig_real_index) elif type == "Acceptor": - highlighted_hbond_acceptor.append(lig_real_index - 1) - - elif interaction_type == "hydrophobic": - for code in numeric_codes: - atom_index = int(code) - complex_id = self.complex.select_atoms(f"id {atom_index}") - for atom in complex_id: - atom_name = atom.name - for lig_atom in self.lig_noh: - if lig_atom.name == atom_name: - lig_real_index = lig_atom.id - highlighted_hydrophobic.append(lig_real_index - 1) - - elif interaction_type == "waterbridge": - for code in numeric_codes: - atom_index = int(code) - complex_id = self.complex.select_atoms(f"id {atom_index}") - for atom in complex_id: - atom_name = atom.name - for lig_atom in self.lig_noh: - if lig_atom.name == atom_name: - lig_real_index = lig_atom.id - highlighted_waterbridge.append(lig_real_index - 1) - - elif interaction_type == "pistacking": - split_codes = numeric_codes[0].split(",") - for code in split_codes: - atom_index = int(code) - complex_id = self.complex.select_atoms(f"id {atom_index}") - for atom in complex_id: - atom_name = atom.name - for lig_atom in self.lig_noh: - if lig_atom.name == atom_name: - lig_real_index = lig_atom.id - highlighted_pistacking.append(lig_real_index - 1) - - elif interaction_type == "halogen": - numeric_codes = parts[1:-2] - for code in numeric_codes: - atom_index = int(code) - complex_id = self.complex.select_atoms(f"id {atom_index}") - for atom in complex_id: - atom_name = atom.name - for lig_atom in self.lig_noh: - if lig_atom.name == atom_name: - lig_real_index = lig_atom.id - highlighted_halogen.append(lig_real_index - 1) - - elif interaction_type == "saltbridge": - numeric_codes = parts[1:-3] - saltbridge_type = parts[-2] - if saltbridge_type == "NI": + highlighted_hbond_acceptor.append(lig_real_index) + + elif interaction_type == "hydrophobic": + highlighted_hydrophobic.append(lig_real_index) + + elif interaction_type == "waterbridge": + highlighted_waterbridge.append(lig_real_index) + + elif interaction_type == "pistacking": split_codes = numeric_codes[0].split(",") for code in split_codes: atom_index = int(code) complex_id = self.complex.select_atoms(f"id {atom_index}") - for atom in complex_id: - atom_name = atom.name - for lig_atom in self.lig_noh: - if lig_atom.name == atom_name: - lig_real_index = lig_atom.id - highlighted_ni.append(lig_real_index - 1) - elif saltbridge_type == "PI": - for code in numeric_codes: - atom_index = int(code) - complex_id = self.complex.select_atoms(f"id {atom_index}") - for atom in complex_id: - atom_name = atom.name - for lig_atom in self.lig_noh: - if lig_atom.name == atom_name: - lig_real_index = lig_atom.id - highlighted_pi.append(lig_real_index - 1) - - elif interaction_type == "pication": - numeric_codes = parts[1:-2] - for code in numeric_codes: - atom_index = int(code) - complex_id = self.complex.select_atoms(f"id {atom_index}") - for atom in complex_id: - atom_name = atom.name - for lig_atom in self.lig_noh: - if lig_atom.name == atom_name: - lig_real_index = lig_atom.id - highlighted_pication.append(lig_real_index - 1) - - elif interaction_type == "metal": - ligidx = parts[1] - atom_index = int(ligidx) - complex_id = self.complex.select_atoms(f"id {atom_index}") - for atom in complex_id: - atom_name = atom.name - for lig_atom in self.lig_noh: - if lig_atom.name == atom_name: - lig_real_index = lig_atom.id - highlighted_metal.append(lig_real_index - 1) - + atom_name = complex_id[0].name if len(complex_id) > 0 else None + if atom_name is None: + continue + lig_atom = self.lig_noh.select_atoms(f"name {atom_name}") + lig_real_index = lig_atom[0].id - 1 if len(lig_atom) > 0 else None + if lig_real_index is not None: + highlighted_pistacking.append(lig_real_index) + + elif interaction_type == "halogen": + highlighted_halogen.append(lig_real_index) + + elif interaction_type == "saltbridge": + saltbridge_type = parts[-2] + if saltbridge_type == "NI": + split_codes = numeric_codes[0].split(",") + for code in split_codes: + atom_index = int(code) + complex_id = self.complex.select_atoms(f"id {atom_index}") + atom_name = complex_id[0].name if len(complex_id) > 0 else None + if atom_name is None: + continue + lig_atom = self.lig_noh.select_atoms(f"name {atom_name}") + lig_real_index = lig_atom[0].id - 1 if len(lig_atom) > 0 else None + if lig_real_index is not None: + highlighted_ni.append(lig_real_index) + + elif saltbridge_type == "PI": + for code in numeric_codes: + atom_index = int(code) + complex_id = self.complex.select_atoms(f"id {atom_index}") + atom_name = complex_id[0].name if len(complex_id) > 0 else None + if atom_name is None: + continue + lig_atom = self.lig_noh.select_atoms(f"name {atom_name}") + lig_real_index = lig_atom[0].id - 1 if len(lig_atom) > 0 else None + if lig_real_index is not None: + highlighted_pi.append(lig_real_index) + + elif interaction_type == "pication": + highlighted_pication.append(lig_real_index) + + elif interaction_type == "metal": + highlighted_metal.append(lig_real_index) + + # Identify common indices between hbond_donor and hbond_acceptor for value in highlighted_hbond_donor[:]: if value in highlighted_hbond_acceptor: highlighted_hbond_donor.remove(value) @@ -300,15 +270,15 @@ def highlight_numbers(self, split_data, starting_idx): highlighted_metal, ) - def generate_interaction_dict(self, interaction_type, keys): + def generate_interaction_dict(self, interaction_type: str, keys: List[int]) -> Dict[int, Tuple[float, float, float]]: """Generates a dictionary of interaction RGB color model based on the provided interaction type. Args: interaction_type (str): The type of the interaction, for example 'hydrophobic'. - keys (list): List of the highlighted atoms that display an interaction. + keys (List[int]): List of the highlighted atoms that display an interaction. Returns: - dict: A dictionary with the interaction types are associated with their respective RGB color codes. + Dict[int, Tuple[float, float, float]]: A dictionary with the interaction types associated with their respective RGB color codes. """ interaction_dict = { "hbond_acceptor": (1.0, 0.6, 0.6), @@ -324,35 +294,52 @@ def generate_interaction_dict(self, interaction_type, keys): "metal": (1.0, 0.6, 0.0), } - interaction_dict = { + if interaction_type not in interaction_dict: + raise ValueError(f"Unknown interaction type: {interaction_type}") + + return { int(key): interaction_dict[interaction_type] for key in keys } - return interaction_dict - def update_dict(self, target_dict, *source_dicts): + def update_dict(self, target_dict: Dict[int, Tuple[float, float, float]], *source_dicts: Dict[int, Tuple[float, float, float]]): """Updates the dictionary with the keys and values from other dictionaries. Args: - target_dict (dict): The dictionary that needs to be updated with new keys and values. - source_dicts (dict): One or multiple dictionaries that are used to update the target dictionary with new keys and values. + target_dict (Dict[int, Tuple[float, float, float]]): The dictionary that needs to be updated with new keys and values. + source_dicts (Dict[int, Tuple[float, float, float]]): One or more dictionaries with keys and values to be merged into the target dictionary. """ for source_dict in source_dicts: for key, value in source_dict.items(): - int_key = int(key) - if int_key not in target_dict: - target_dict[int_key] = value + if key in target_dict: + # Combine the RGB values (optional approach; adjust as needed) + existing_value = target_dict[key] + target_dict[key] = tuple( + min(1.0, max(0.0, (existing_value[i] + value[i]) / 2)) + for i in range(3) + ) + else: + target_dict[key] = value + +from typing import List, Optional +from PIL import Image +import pylab +import os class ImageMerger: def __init__( - self, binding_mode, occurrence_percent, split_data, merged_image_paths - ): + self, + binding_mode: str, + occurrence_percent: float, + split_data: List[str], + merged_image_paths: List[str] + ) -> None: self.binding_mode = binding_mode self.occurrence_percent = occurrence_percent self.split_data = split_data self.merged_image_paths = merged_image_paths - def create_and_merge_images(self): + def create_and_merge_images(self) -> List[str]: """Create and merge images to generate a legend for binding modes. Returns: @@ -374,17 +361,17 @@ def create_and_merge_images(self): for i, data in enumerate(filtered_split_data): y = data_points[i] label = data.split()[-1] - type = data.split()[-2] + type_ = data.split()[-2] # Renamed 'type' to 'type_' to avoid conflict with the built-in type() function if label == "hydrophobic": (line,) = ax.plot( x, y, label=data, color=(1.0, 1.0, 0.0), linewidth=5.0 ) elif label == "hbond": - if type == "Acceptor": + if type_ == "Acceptor": (line,) = ax.plot( x, y, label=data, color=(1.0, 0.6, 0.6), linewidth=5.0 ) - elif type == "Donor": + elif type_ == "Donor": (line,) = ax.plot( x, y, label=data, color=(0.3, 0.5, 1.0), linewidth=5.0 ) @@ -409,11 +396,11 @@ def create_and_merge_images(self): x, y, label=data, color=(1.0, 0.6, 0.0), linewidth=5.0 ) elif label == "saltbridge": - if type == "NI": + if type_ == "NI": (line,) = ax.plot( x, y, label=data, color=(0.0, 0.0, 1.0), linewidth=5.0 ) - elif type == "PI": + elif type_ == "PI": (line,) = ax.plot( x, y, label=data, color=(1.0, 0.0, 0.0), linewidth=5.0 ) @@ -476,11 +463,11 @@ def create_and_merge_images(self): class FigureArranger: - def __init__(self, merged_image_paths, output_path): + def __init__(self, merged_image_paths: List[str], output_path: str) -> None: self.merged_image_paths = merged_image_paths self.output_path = output_path - def arranged_figure_generation(self): + def arranged_figure_generation(self) -> None: """Generate an arranged figure by arranging merged images in rows and columns.""" # Open the list of images merged_images = [Image.open(path) for path in self.merged_image_paths] From 70a2142c2dddb15381f5916b153cb0c91767d12e Mon Sep 17 00:00:00 2001 From: ARMI Date: Mon, 19 Aug 2024 14:00:28 +0200 Subject: [PATCH 04/23] Update preprocessing.py added type checking and type hinting --- openmmdl/openmmdl_analysis/preprocessing.py | 172 ++++++-------------- 1 file changed, 53 insertions(+), 119 deletions(-) diff --git a/openmmdl/openmmdl_analysis/preprocessing.py b/openmmdl/openmmdl_analysis/preprocessing.py index 15928c92..0c7530b1 100644 --- a/openmmdl/openmmdl_analysis/preprocessing.py +++ b/openmmdl/openmmdl_analysis/preprocessing.py @@ -1,189 +1,139 @@ import MDAnalysis as mda -import subprocess import os import re +from typing import List, Optional import rdkit -import mdtraj as md from rdkit import Chem -from rdkit.Chem import Draw from rdkit.Chem import AllChem -from rdkit.Chem.Draw import rdMolDraw2D +from rdkit.Chem import Mol from openbabel import pybel - +import mdtraj as md class Preprocessing: def __init__(self): pass - def renumber_protein_residues(self, input_pdb, reference_pdb, output_pdb): - """Renumber protein residues in a molecular dynamics trajectory based on a reference structure. + def renumber_protein_residues( + self, input_pdb: str, reference_pdb: str, output_pdb: str + ) -> None: + """ + Renumber protein residues in a molecular dynamics trajectory based on a reference structure. Args: input_pdb (str): Path to the input PDB file representing the molecular dynamics trajectory to be renumbered. reference_pdb (str): Path to the reference PDB file representing the molecular dynamics trajectory used as a reference. - output_pdb (str): Path to the output PDB file where the renumbered trajectory will be saved.. + output_pdb (str): Path to the output PDB file where the renumbered trajectory will be saved. """ - # Load trajectories traj_input = md.load(input_pdb) traj_reference = md.load(reference_pdb) - # Get the topology DataFrames input_top_df, _ = traj_input.topology.to_dataframe() ref_top_df, _ = traj_reference.topology.to_dataframe() - # List of common protein residue names including additional residues protein_residues = [ - "ALA", - "ARG", - "ASN", - "ASP", - "CYS", - "GLN", - "GLU", - "GLY", - "HIS", - "ILE", - "LEU", - "LYS", - "MET", - "PHE", - "PRO", - "SER", - "THR", - "TRP", - "TYR", - "VAL", - "ACE", - "NME", - "HIE", - "HID", - "HIP", + "ALA", "ARG", "ASN", "ASP", "CYS", "GLN", "GLU", "GLY", "HIS", "ILE", + "LEU", "LYS", "MET", "PHE", "PRO", "SER", "THR", "TRP", "TYR", "VAL", + "ACE", "NME", "HIE", "HID", "HIP" ] - # Iterate over each chain in the reference topology for chain_id in ref_top_df["chainID"].unique(): - # Extract residue indices from the reference topology for the current chain ref_residue_indices = ( ref_top_df[ (ref_top_df["chainID"] == chain_id) & ref_top_df["resName"].isin(protein_residues) - ]["resSeq"].values - - 1 + ]["resSeq"].values - 1 ) - # Update residue indices in the input topology DataFrame for the current chain - mask = (input_top_df["chainID"] == chain_id) & input_top_df["resName"].isin( - protein_residues - ) + mask = (input_top_df["chainID"] == chain_id) & input_top_df["resName"].isin(protein_residues) input_top_df.loc[mask, "resSeq"] = ref_residue_indices + 1 - # Create a new topology from the modified DataFrame new_top = md.Topology.from_dataframe(input_top_df) - - # Update the topology in the input trajectory new_traj = traj_input.slice(range(traj_input.n_frames)) new_traj.topology = new_top - - # Save the renumbered trajectory to a new PDB file new_traj.save(output_pdb) - def increase_ring_indices(self, ring, lig_index): - """Increases the atom indices in a ring of the ligand obtained from the ligand to fit the atom indices present in the protein-ligand complex. + def increase_ring_indices(self, ring: List[int], lig_index: int) -> List[int]: + """ + Increases the atom indices in a ring of the ligand obtained from the ligand to fit the atom indices present in the protein-ligand complex. Args: - ring (str): A list of atom indices belonging to a ring that need to be modified. - lig_index (int): An integer that is the first number of the ligand atom indices obtained from the protein-ligand, which is used to modify the ring indices + ring (List[int]): A list of atom indices belonging to a ring that need to be modified. + lig_index (int): An integer that is the first number of the ligand atom indices obtained from the protein-ligand, which is used to modify the ring indices. Returns: - list: A new list with modified atom indicies. + List[int]: A new list with modified atom indices. """ return [atom_idx + lig_index for atom_idx in ring] - def convert_ligand_to_smiles(self, input_sdf, output_smi): - """Converts ligand structures from an SDF file to SMILES :) format + def convert_ligand_to_smiles(self, input_sdf: str, output_smi: str) -> None: + """ + Converts ligand structures from an SDF file to SMILES format. Args: - input_sdf (str): Path to the SDF file with the ligand that wll be converted. - output_smi (str): Path to the output SMILES files. + input_sdf (str): Path to the SDF file with the ligand that will be converted. + output_smi (str): Path to the output SMILES file. """ - # Create a molecule supplier from an SDF file mol_supplier = Chem.SDMolSupplier(input_sdf) - - # Open the output SMILES file for writing with open(output_smi, "w") as output_file: - # Iterate through molecules and convert each to SMILES for mol in mol_supplier: - if mol is not None: # Make sure the molecule was successfully read + if mol is not None: smiles = Chem.MolToSmiles(mol) output_file.write(smiles + "\n") else: - print("nono") + print("Molecule could not be read.") - def process_pdb_file(self, input_pdb_filename): - """Process a PDB file to make it compatible with the openmmdl_analysis package. + def process_pdb_file(self, input_pdb_filename: str) -> None: + """ + Process a PDB file to make it compatible with the openmmdl_analysis package. Args: - input_pdb_filename (str): path to the input PDB file + input_pdb_filename (str): Path to the input PDB file. """ - # Load the PDB file u = mda.Universe(input_pdb_filename) - - # Iterate through the topology to modify residue names for atom in u.atoms: resname = atom.resname - - # Check if the residue name is one of the specified water names if resname in ["SPC", "TIP3", "TIP4", "WAT", "T3P", "T4P", "T5P"]: atom.residue.resname = "HOH" elif resname == "*": atom.residue.resname = "UNK" - - # Save the modified topology to a new PDB file u.atoms.write(input_pdb_filename) def extract_and_save_ligand_as_sdf( - self, input_pdb_filename, output_filename, target_resname - ): - """Extract and save the ligand from the receptor ligand complex PDB file into a new PDB file by itself. + self, input_pdb_filename: str, output_filename: str, target_resname: str + ) -> None: + """ + Extract and save the ligand from the receptor-ligand complex PDB file into a new SDF file. Args: - input_pdb_filename (str): name of the input PDB file - output_pdb_filename (str): name of the output SDF file - target_resname (str): resname of the ligand in the original PDB file + input_pdb_filename (str): Name of the input PDB file. + output_filename (str): Name of the output SDF file. + target_resname (str): Residue name of the ligand in the original PDB file. """ - # Load the PDB file using MDAnalysis u = mda.Universe(input_pdb_filename) - - # Find the ligand by its residue name ligand_atoms = u.select_atoms(f"resname {target_resname}") if len(ligand_atoms) == 0: - print( - f"No ligand with residue name '{target_resname}' found in the PDB file." - ) + print(f"No ligand with residue name '{target_resname}' found in the PDB file.") return - # Create a new Universe with only the ligand ligand_universe = mda.Merge(ligand_atoms) - - # Save the ligand Universe as a PDB file ligand_universe.atoms.write("lig.pdb") - # use openbabel to convert pdb to sdf mol = next(pybel.readfile("pdb", "lig.pdb")) mol.write("sdf", output_filename, overwrite=True) - # remove the temporary pdb file os.remove("lig.pdb") - def renumber_atoms_in_residues(self, input_pdb_file, output_pdb_file, lig_name): - """Renumer the atoms of the ligand in the topology PDB file. + def renumber_atoms_in_residues( + self, input_pdb_file: str, output_pdb_file: str, lig_name: str + ) -> None: + """ + Renumber the atoms of the ligand in the topology PDB file. Args: input_pdb_file (str): Path to the initial PDB file. output_pdb_file (str): Path to the output PDB file. lig_name (str): Name of the ligand in the input PDB file. """ - # Read the input PDB file with open(input_pdb_file, "r") as f: pdb_lines = f.readlines() @@ -192,40 +142,25 @@ def renumber_atoms_in_residues(self, input_pdb_file, output_pdb_file, lig_name): for line in pdb_lines: if line.startswith("ATOM"): - # Extract information from the ATOM line atom_serial = int(line[6:11]) atom_name = line[12:16].strip() residue_name = line[17:20].strip() - # Check if the residue is residue_name if residue_name == lig_name: - # Extract the element from the atom name (assuming it starts with a capital letter) element_match = re.match(r"([A-Z]+)", atom_name) - if element_match: - element = element_match.group(1) - else: - element = atom_name - - # Increment the count for the current element in the current residue_name residue - lig_residue_elements[element] = ( - lig_residue_elements.get(element, 0) + 1 - ) - - # Update the atom name based on the element count + element = element_match.group(1) if element_match else atom_name + lig_residue_elements[element] = lig_residue_elements.get(element, 0) + 1 new_atom_name = f"{element}{lig_residue_elements[element]}" - - # Replace the old atom name with the new one line = line[:12] + f"{new_atom_name:4}" + line[16:] - # Append the line to the new PDB lines new_pdb_lines.append(line) - # Write the modified PDB lines to the output file with open(output_pdb_file, "w") as f: f.writelines(new_pdb_lines) - def replace_atom_type(self, data): - """Replace wrong ligand atom types in the topology PDB file. + def replace_atom_type(self, data: str) -> str: + """ + Replace incorrect ligand atom types in the topology PDB file. Args: data (str): Text of the initial PDB file. @@ -236,18 +171,17 @@ def replace_atom_type(self, data): lines = data.split("\n") for i, line in enumerate(lines): if " LIG X" in line: - # Extract the last column which contains the atom type (O/N/H) atom_type = line[12:13].strip() - # Replace 'X' with the correct atom type lines[i] = line.replace(" LIG X", f" LIG {atom_type}") return "\n".join(lines) - def process_pdb(self, input_file, output_file): - """Wrapper function to process a PDB file. + def process_pdb(self, input_file: str, output_file: str) -> None: + """ + Wrapper function to process a PDB file. Args: input_file (str): Path to the input PDB file. - output_file (str): Path of the output PDB file. + output_file (str): Path to the output PDB file. """ with open(input_file, "r") as f: pdb_data = f.read() From 66bd74d87bce112dc0c91a5e182063314af1e99e Mon Sep 17 00:00:00 2001 From: ARMI Date: Mon, 19 Aug 2024 14:07:57 +0200 Subject: [PATCH 05/23] Update barcode_generation.py added type checking and type hinting --- .../openmmdl_analysis/barcode_generation.py | 103 +++++++++++++----- 1 file changed, 77 insertions(+), 26 deletions(-) diff --git a/openmmdl/openmmdl_analysis/barcode_generation.py b/openmmdl/openmmdl_analysis/barcode_generation.py index e362bb83..2ec99d02 100644 --- a/openmmdl/openmmdl_analysis/barcode_generation.py +++ b/openmmdl/openmmdl_analysis/barcode_generation.py @@ -1,20 +1,28 @@ import os import numpy as np +import pandas as pd import matplotlib.pyplot as plt +from typing import List, Dict, Union class BarcodeGenerator: - def __init__(self, df): + def __init__(self, df: pd.DataFrame): """ Initializes the BarcodeGenerator with a dataframe. Args: - df (pandas dataframe): Dataframe containing all interactions from plip analysis (typically df_all) + df (pd.DataFrame): Dataframe containing all interactions from plip analysis (typically df_all). """ self.df = df self.interactions = self.gather_interactions() - def gather_interactions(self): + def gather_interactions(self) -> Dict[str, pd.Index]: + """ + Gathers columns related to different types of interactions. + + Returns: + Dict[str, pd.Index]: Dictionary where keys are interaction types and values are columns corresponding to those interactions. + """ hydrophobic_interactions = self.df.filter(regex="hydrophobic").columns acceptor_interactions = self.df.filter(regex="Acceptor_hbond").columns donor_interactions = self.df.filter(regex="Donor_hbond").columns @@ -39,15 +47,15 @@ def gather_interactions(self): "metal": metal_interactions, } - def generate_barcode(self, interaction): + def generate_barcode(self, interaction: str) -> np.ndarray: """ Generates barcodes for a given interaction. Args: - interaction (str): Name of the interaction to generate a barcode for + interaction (str): Name of the interaction to generate a barcode for. Returns: - np.array: Binary array with 1 representing the interaction is present in the corresponding frame + np.ndarray: Binary array with 1 representing the interaction is present in the corresponding frame. """ barcode = [] unique_frames = self.df["FRAME"].unique() @@ -61,20 +69,20 @@ def generate_barcode(self, interaction): return np.array(barcode) - def generate_waterids_barcode(self, interaction): + def generate_waterids_barcode(self, interaction: str) -> List[Union[int, int]]: """ Generates a barcode containing corresponding water ids for a given interaction. Args: - interaction (str): Name of the interaction to generate a barcode for + interaction (str): Name of the interaction to generate a barcode for. Returns: - list: List of water ids for the frames where the interaction is present, 0 if no interaction present + List[Union[int, int]]: List of water ids for the frames where the interaction is present, 0 if no interaction present. """ water_id_list = [] waterid_barcode = [] - for index, row in self.df.iterrows(): + for _, row in self.df.iterrows(): if row[interaction] == 1: water_id_list.append(int(float(row["WATER_IDX"]))) @@ -88,15 +96,15 @@ def generate_waterids_barcode(self, interaction): return waterid_barcode - def interacting_water_ids(self, waterbridge_interactions): - """Generates a list of all water ids that form water bridge interactions. + def interacting_water_ids(self, waterbridge_interactions: List[str]) -> List[int]: + """ + Generates a list of all water ids that form water bridge interactions. Args: - df_all (pandas dataframe): dataframe containing all interactions from plip analysis (typicaly df_all) - waterbridge_interactions (list): list of strings containing the names of all water bridge interactions + waterbridge_interactions (List[str]): List of strings containing the names of all water bridge interactions. Returns: - list: list of all unique water ids that form water bridge interactions + List[int]: List of all unique water ids that form water bridge interactions. """ interacting_waters = [] for waterbridge_interaction in waterbridge_interactions: @@ -107,12 +115,34 @@ def interacting_water_ids(self, waterbridge_interactions): return list(set(interacting_waters)) +class BarcodeGenerator: + def __init__(self, df: pd.DataFrame): + self.df = df + + def generate_waterids_barcode(self, interaction: str) -> List[Union[int, int]]: + # Placeholder implementation + return [] + + class BarcodePlotter: - def __init__(self, df_all): + def __init__(self, df_all: pd.DataFrame): + """ + Initializes the BarcodePlotter with a dataframe and BarcodeGenerator instance. + + Args: + df_all (pd.DataFrame): Dataframe containing all interactions from plip analysis. + """ self.df_all = df_all self.barcode_gen = BarcodeGenerator(df_all) - def plot_barcodes(self, barcodes, save_path): + def plot_barcodes(self, barcodes: Dict[str, np.ndarray], save_path: str) -> None: + """ + Plots barcodes and saves the figure to the specified path. + + Args: + barcodes (Dict[str, np.ndarray]): Dictionary where keys are interaction names and values are binary barcode arrays. + save_path (str): Path to save the plotted figure. + """ if not barcodes: print("No barcodes to plot.") return @@ -154,8 +184,19 @@ def plot_barcodes(self, barcodes, save_path): plt.savefig(save_path, dpi=300, bbox_inches="tight") def plot_waterbridge_piechart( - self, waterbridge_barcodes, waterbridge_interactions, fig_type - ): + self, + waterbridge_barcodes: Dict[str, np.ndarray], + waterbridge_interactions: List[str], + fig_type: str + ) -> None: + """ + Plots pie charts for waterbridge interactions and saves them to files. + + Args: + waterbridge_barcodes (Dict[str, np.ndarray]): Dictionary where keys are interaction names and values are binary barcode arrays. + waterbridge_interactions (List[str]): List of water bridge interaction names. + fig_type (str): File extension for the saved figures (e.g., 'png', 'pdf'). + """ if not waterbridge_barcodes: print("No Piecharts to plot.") return @@ -201,7 +242,7 @@ def plot_waterbridge_piechart( plt.pie( values, labels=labels, - autopct=lambda pct: f"{pct:.1f}%\n({int(round(pct/100.0 * sum(values)))})", + autopct=lambda pct: f"{pct:.1f}%\n({int(round(pct / 100.0 * sum(values)))})", shadow=False, startangle=140, ) @@ -227,8 +268,18 @@ def plot_waterbridge_piechart( dpi=300, ) - def plot_barcodes_grouped(self, interactions, interaction_type, fig_type): - ligatoms_dict = {} + def plot_barcodes_grouped( + self, interactions: List[str], interaction_type: str, fig_type: str + ) -> None: + """ + Plots grouped barcodes for interactions and saves the figure to a file. + + Args: + interactions (List[str]): List of interaction names. + interaction_type (str): Type of interaction for grouping. + fig_type (str): File extension for the saved figure (e.g., 'png', 'pdf'). + """ + ligatoms_dict: Dict[str, List[str]] = {} for interaction in interactions: ligatom = interaction.split("_") ligatom.pop(0) @@ -246,10 +297,10 @@ def plot_barcodes_grouped(self, interactions, interaction_type, fig_type): ligatom = "_".join(ligatom) ligatoms_dict.setdefault(ligatom, []).append(interaction) - total_interactions = {} - for ligatom in ligatoms_dict: - ligatom_interaction_barcodes = {} - for interaction in ligatoms_dict[ligatom]: + total_interactions: Dict[str, np.ndarray] = {} + for ligatom, interactions_list in ligatoms_dict.items(): + ligatom_interaction_barcodes: Dict[str, np.ndarray] = {} + for interaction in interactions_list: barcode = self.barcode_gen.generate_barcode(interaction) ligatom_interaction_barcodes[interaction] = barcode os.makedirs(f"./Barcodes/{ligatom}", exist_ok=True) From 50ab1ba7469c6bb90206ecf6d1fd53a4b659390f Mon Sep 17 00:00:00 2001 From: ARMI Date: Mon, 19 Aug 2024 14:12:39 +0200 Subject: [PATCH 06/23] Update markov_state_figure_generation.py added type checking and hinting --- .../markov_state_figure_generation.py | 239 +++++++----------- 1 file changed, 95 insertions(+), 144 deletions(-) diff --git a/openmmdl/openmmdl_analysis/markov_state_figure_generation.py b/openmmdl/openmmdl_analysis/markov_state_figure_generation.py index fd9d398d..7e40c020 100644 --- a/openmmdl/openmmdl_analysis/markov_state_figure_generation.py +++ b/openmmdl/openmmdl_analysis/markov_state_figure_generation.py @@ -2,18 +2,18 @@ import matplotlib.pyplot as plt from matplotlib.patches import Patch import os - +from typing import Dict, List, Tuple, Union class MarkovChainAnalysis: - def __init__(self, min_transition): + def __init__(self, min_transition: float): self.min_transition = min_transition - self.min_transitions = self.calculate_min_transitions() + self.min_transitions: List[float] = self.calculate_min_transitions() - def calculate_min_transitions(self): + def calculate_min_transitions(self) -> List[float]: """Calculates a list based on the minimum transition time provided values and returns it in factors 1, 2, 5, 10. Returns: - list: List with the minimum transition time with factors 1, 2, 5, 10. + List[float]: List with the minimum transition time with factors 1, 2, 5, 10. """ min_transitions = [ self.min_transition, @@ -24,42 +24,47 @@ def calculate_min_transitions(self): return min_transitions def generate_transition_graph( - self, total_frames, combined_dict, fig_type="png", font_size=36, size_node=200 - ): + self, + total_frames: int, + combined_dict: Dict[str, List[str]], + fig_type: str = "png", + font_size: int = 36, + size_node: int = 200 + ) -> None: """Generate Markov Chain plots based on transition probabilities. Args: total_frames (int): The number of frames in the protein-ligand MD simulation. - combined_dict (dict): A dictionary with the information of the Binding Modes and their order of appearance during the simulation for all frames. + combined_dict (Dict[str, List[str]]): A dictionary with the information of the Binding Modes and their order of appearance during the simulation for all frames. fig_type (str, optional): File type for the output figures. Default is 'png'. font_size (int, optional): The font size for the node labels. The default value is set to 36. size_node (int, optional): The size of the nodes in the Markov Chain plot. The default value is set to 200. """ # Calculate the number of elements in each part - total_length = len(combined_dict["all"]) - part_length = total_length // 3 - remaining_length = total_length % 3 + total_length: int = len(combined_dict["all"]) + part_length: int = total_length // 3 + remaining_length: int = total_length % 3 # Divide the 'all_data' into three parts - part1_length = part_length + remaining_length - part2_length = part_length - part1_data = combined_dict["all"][:part1_length] - part2_data = combined_dict["all"][part1_length : part1_length + part2_length] - part3_data = combined_dict["all"][part1_length + part2_length :] + part1_length: int = part_length + remaining_length + part2_length: int = part_length + part1_data: List[str] = combined_dict["all"][:part1_length] + part2_data: List[str] = combined_dict["all"][part1_length: part1_length + part2_length] + part3_data: List[str] = combined_dict["all"][part1_length + part2_length:] # Count the occurrences of each node in each part - part1_node_occurrences = { + part1_node_occurrences: Dict[str, int] = { node: part1_data.count(node) for node in set(part1_data) } - part2_node_occurrences = { + part2_node_occurrences: Dict[str, int] = { node: part2_data.count(node) for node in set(part2_data) } - part3_node_occurrences = { + part3_node_occurrences: Dict[str, int] = { node: part3_data.count(node) for node in set(part3_data) } # Create the legend - legend_labels = { + legend_labels: Dict[str, str] = { "Blue": "Binding mode not in top 10 occurrence", "Green": "Binding Mode occurrence mostly in first third of frames", "Orange": "Binding Mode occurrence mostly in second third of frames", @@ -67,134 +72,102 @@ def generate_transition_graph( "Yellow": "Binding Mode occurs throughout all trajectory equally", } - legend_colors = ["skyblue", "green", "orange", "red", "yellow"] + legend_colors: List[str] = ["skyblue", "green", "orange", "red", "yellow"] - legend_handles = [ + legend_handles: List[Patch] = [ Patch(color=color, label=label) for color, label in zip(legend_colors, legend_labels.values()) ] # Get the top 10 nodes with the most occurrences - node_occurrences = { + node_occurrences: Dict[str, int] = { node: combined_dict["all"].count(node) for node in set(combined_dict["all"]) } - top_10_nodes = sorted(node_occurrences, key=node_occurrences.get, reverse=True)[ - :10 - ] + top_10_nodes: List[str] = sorted(node_occurrences, key=node_occurrences.get, reverse=True)[:10] for min_transition_percent in self.min_transitions: - min_prob = min_transition_percent / 100 # Convert percentage to probability + min_prob: float = min_transition_percent / 100 # Convert percentage to probability # Create a directed graph - G = nx.DiGraph() + G: nx.DiGraph = nx.DiGraph() # Count the occurrences of each transition and self-loop - transitions = {} - self_loops = {} + transitions: Dict[Tuple[str, str], int] = {} + self_loops: Dict[Tuple[str, str], int] = {} for i in range(len(combined_dict["all"]) - 1): - current_state = combined_dict["all"][i] - next_state = combined_dict["all"][i + 1] + current_state: str = combined_dict["all"][i] + next_state: str = combined_dict["all"][i + 1] if current_state == next_state: # Check for self-loop - self_loop = (current_state, next_state) + self_loop: Tuple[str, str] = (current_state, next_state) self_loops[self_loop] = self_loops.get(self_loop, 0) + 1 else: - transition = (current_state, next_state) + transition: Tuple[str, str] = (current_state, next_state) transitions[transition] = transitions.get(transition, 0) + 1 # Add edges to the graph with their probabilities for transition, count in transitions.items(): current_state, next_state = transition - probability = ( - count / len(combined_dict["all"]) * 100 - ) # Convert probability to percentage + probability: float = (count / len(combined_dict["all"])) * 100 # Convert probability to percentage if probability >= min_transition_percent: G.add_edge(current_state, next_state, weight=probability) # Include the reverse transition with a different color - reverse_transition = (next_state, current_state) - reverse_count = transitions.get( - reverse_transition, 0 - ) # Use the correct count for the reverse transition - reverse_probability = ( - reverse_count / len(combined_dict["all"]) * 100 - ) - G.add_edge( - next_state, - current_state, - weight=reverse_probability, - reverse=True, - ) + reverse_transition: Tuple[str, str] = (next_state, current_state) + reverse_count: int = transitions.get(reverse_transition, 0) # Use the correct count for the reverse transition + reverse_probability: float = (reverse_count / len(combined_dict["all"])) * 100 + G.add_edge(next_state, current_state, weight=reverse_probability, reverse=True) # Add self-loops to the graph with their probabilities for self_loop, count in self_loops.items(): - state = self_loop[0] - probability = ( - count / len(combined_dict["all"]) * 100 - ) # Convert probability to percentage + state: str = self_loop[0] + probability: float = (count / len(combined_dict["all"])) * 100 # Convert probability to percentage if probability >= min_transition_percent: G.add_edge(state, state, weight=probability) # Calculate transition probabilities for each direction (excluding self-loops) - transition_probabilities_forward = {} - transition_probabilities_backward = {} - transition_occurrences_forward = {} - transition_occurrences_backward = {} + transition_probabilities_forward: Dict[Tuple[str, str], float] = {} + transition_probabilities_backward: Dict[Tuple[str, str], float] = {} + transition_occurrences_forward: Dict[Tuple[str, str], float] = {} + transition_occurrences_backward: Dict[Tuple[str, str], float] = {} for transition, count in transitions.items(): start_state, end_state = transition - forward_transition = (start_state, end_state) - backward_transition = (end_state, start_state) + forward_transition: Tuple[str, str] = (start_state, end_state) + backward_transition: Tuple[str, str] = (end_state, start_state) # Separate counts for forward and backward transitions - forward_count = transitions.get(forward_transition, 0) - backward_count = transitions.get(backward_transition, 0) + forward_count: int = transitions.get(forward_transition, 0) + backward_count: int = transitions.get(backward_transition, 0) - transition_probabilities_forward[forward_transition] = ( - forward_count / node_occurrences[start_state] * 100 - ) + transition_probabilities_forward[forward_transition] = (forward_count / node_occurrences[start_state]) * 100 + transition_occurrences_forward[forward_transition] = (forward_count / len(combined_dict["all"])) * 100 - transition_occurrences_forward[forward_transition] = ( - forward_count / len(combined_dict["all"]) * 100 - ) - - transition_probabilities_backward[backward_transition] = ( - backward_count / node_occurrences[end_state] * 100 - ) - - transition_occurrences_backward[backward_transition] = ( - backward_count / len(combined_dict["all"]) * 100 - ) + transition_probabilities_backward[backward_transition] = (backward_count / node_occurrences[end_state]) * 100 + transition_occurrences_backward[backward_transition] = (backward_count / len(combined_dict["all"])) * 100 # Calculate self-loop probabilities - self_loop_probabilities = {} - self_loop_occurences = {} + self_loop_probabilities: Dict[str, float] = {} + self_loop_occurences: Dict[str, float] = {} for self_loop, count in self_loops.items(): - state = self_loop[0] + state: str = self_loop[0] self_loop_probabilities[state] = count / node_occurrences[state] self_loop_occurences[state] = count / len(combined_dict["all"]) * 100 # Generate the Markov Chain plot plt.figure(figsize=(60, 60)) # Increased figure size - plt.title( - f"Markov Chain Plot {min_transition_percent}% Frames Transition", - fontsize=72, - ) + plt.title(f"Markov Chain Plot {min_transition_percent}% Frames Transition", fontsize=72) # Draw nodes and edges - pos = nx.spring_layout( - G, k=2, seed=42 - ) # Increased distance between nodes (k=2) - edge_colors = [] + pos: Dict[str, Tuple[float, float]] = nx.spring_layout(G, k=2, seed=42) # Increased distance between nodes (k=2) + edge_colors: List[str] = [] for u, v, data in G.edges(data=True): - weight = data["weight"] + weight: float = data["weight"] if u == v: # Check if it is a self-loop edge_colors.append("green") # Set green color for self-loop arrows - width = 0.1 # Make self-loop arrows smaller - connection_style = ( - "arc3,rad=-0.1" # Make the self-loops more curved and compact - ) + width: float = 0.1 # Make self-loop arrows smaller + connection_style: str = "arc3,rad=-0.1" # Make the self-loops more curved and compact nx.draw_networkx_edges( G, pos, @@ -205,18 +178,16 @@ def generate_transition_graph( connectionstyle=connection_style, ) elif weight >= min_transition_percent: - edge_colors.append( - "black" - ) # Highlight significant transitions in red + edge_colors.append("black") # Highlight significant transitions in black # Check if both nodes are present before adding labels if G.has_node(u) and G.has_node(v): - width = 4.0 - forward_label = f"{transition_occurrences_forward.get((v, u), 0):.2f}% of Frames →, {transition_probabilities_forward.get((v, u), 0):.2f}% probability" - backward_label = f"{transition_occurrences_backward.get((u, v), 0):.2f}% of Frames ←, {transition_probabilities_backward.get((u, v), 0):.2f}% probability" - edge_label = f"{forward_label}\n{backward_label}" + width: float = 4.0 + forward_label: str = f"{transition_occurrences_forward.get((v, u), 0):.2f}% of Frames →, {transition_probabilities_forward.get((v, u), 0):.2f}% probability" + backward_label: str = f"{transition_occurrences_backward.get((u, v), 0):.2f}% of Frames ←, {transition_probabilities_backward.get((u, v), 0):.2f}% probability" + edge_label: str = f"{forward_label}\n{backward_label}" - connection_style = "arc3,rad=-0.1" + connection_style: str = "arc3,rad=-0.1" nx.draw_networkx_edges( G, pos, @@ -226,22 +197,18 @@ def generate_transition_graph( edge_color=edge_colors[-1], connectionstyle=connection_style, ) - nx.draw_networkx_edge_labels( - G, pos, edge_labels={(u, v): edge_label}, font_size=26 - ) + nx.draw_networkx_edge_labels(G, pos, edge_labels={(u, v): edge_label}, font_size=26) else: - edge_colors.append( - "grey" - ) # Use black for non-significant transitions - width = 0.5 + edge_colors.append("grey") # Use grey for non-significant transitions + width: float = 0.5 # Check if both nodes are present before adding labels if G.has_node(u) and G.has_node(v): - forward_label = f"{transition_occurrences_forward.get((v, u), 0):.2f}% of Frames →, {transition_probabilities_forward.get((v, u), 0):.2f}% probability" - backward_label = f"{transition_occurrences_backward.get((u, v), 0):.2f}% of Frames ←, {transition_probabilities_backward.get((u, v), 0):.2f}% probability" - edge_label = f"{forward_label}\n{backward_label}" + forward_label: str = f"{transition_occurrences_forward.get((v, u), 0):.2f}% of Frames →, {transition_probabilities_forward.get((v, u), 0):.2f}% probability" + backward_label: str = f"{transition_occurrences_backward.get((u, v), 0):.2f}% of Frames ←, {transition_probabilities_backward.get((u, v), 0):.2f}% probability" + edge_label: str = f"{forward_label}\n{backward_label}" - connection_style = "arc3,rad=-0.1" + connection_style: str = "arc3,rad=-0.1" nx.draw_networkx_edges( G, pos, @@ -251,23 +218,15 @@ def generate_transition_graph( edge_color=edge_colors[-1], connectionstyle=connection_style, ) - nx.draw_networkx_edge_labels( - G, pos, edge_labels={(u, v): edge_label}, font_size=36 - ) + nx.draw_networkx_edge_labels(G, pos, edge_labels={(u, v): edge_label}, font_size=36) # Update the node colors based on their appearance percentages in each part - node_colors = [] + node_colors: List[str] = [] for node in G.nodes(): if node in top_10_nodes: - part1_percentage = ( - part1_node_occurrences.get(node, 0) / node_occurrences[node] - ) - part2_percentage = ( - part2_node_occurrences.get(node, 0) / node_occurrences[node] - ) - part3_percentage = ( - part3_node_occurrences.get(node, 0) / node_occurrences[node] - ) + part1_percentage: float = (part1_node_occurrences.get(node, 0) / node_occurrences[node]) + part2_percentage: float = (part2_node_occurrences.get(node, 0) / node_occurrences[node]) + part3_percentage: float = (part3_node_occurrences.get(node, 0) / node_occurrences[node]) if part1_percentage > 0.5: node_colors.append("green") @@ -281,22 +240,20 @@ def generate_transition_graph( node_colors.append("skyblue") # Draw nodes with sizes correlated to occurrences and color top 10 nodes - node_size = [size_node * node_occurrences[node] for node in G.nodes()] - nx.draw_networkx_nodes( - G, pos, node_size=node_size, node_color=node_colors, alpha=0.8 - ) + node_size: List[int] = [size_node * node_occurrences[node] for node in G.nodes()] + nx.draw_networkx_nodes(G, pos, node_size=node_size, node_color=node_colors, alpha=0.8) # Draw node labels with occurrence percentage and self-loop probability for nodes with edges - node_labels = {} + node_labels: Dict[str, str] = {} for node in G.nodes(): if G.degree(node) > 0: # Check if the node has at least one edge - edges_with_node = [ + edges_with_node: List[Tuple[str, str, float]] = [ (u, v, data["weight"]) for u, v, data in G.edges(data=True) if u == node or v == node ] - relevant_edges = [ + relevant_edges: List[Tuple[str, str, float]] = [ edge for edge in edges_with_node if edge[2] >= min_transition_percent @@ -304,14 +261,10 @@ def generate_transition_graph( if relevant_edges: if node in top_10_nodes: - node_occurrence_percentage = ( - node_occurrences[node] / len(combined_dict["all"]) * 100 - ) - self_loop_probability = ( - self_loop_probabilities.get(node, 0) * 100 - ) - self_loop_occurence = self_loop_occurences.get(node, 0) - node_label = f"{node}\nOccurrences: {node_occurrence_percentage:.2f}%\nSelf-Loop Probability: {self_loop_probability:.2f}% \nSelf-Loop Occurrence: {self_loop_occurence:.2f}%" + node_occurrence_percentage: float = (node_occurrences[node] / len(combined_dict["all"])) * 100 + self_loop_probability: float = self_loop_probabilities.get(node, 0) * 100 + self_loop_occurence: float = self_loop_occurences.get(node, 0) + node_label: str = f"{node}\nOccurrences: {node_occurrence_percentage:.2f}%\nSelf-Loop Probability: {self_loop_probability:.2f}% \nSelf-Loop Occurrence: {self_loop_occurence:.2f}%" node_labels[node] = node_label else: node_labels[node] = node @@ -332,11 +285,9 @@ def generate_transition_graph( plt.tight_layout() # Save the plot - plot_filename = f"markov_chain_plot_{min_transition_percent}.{fig_type}" - plot_path = os.path.join("Binding_Modes_Markov_States", plot_filename) - os.makedirs( - "Binding_Modes_Markov_States", exist_ok=True - ) # Create the folder if it doesn't exist + plot_filename: str = f"markov_chain_plot_{min_transition_percent}.{fig_type}" + plot_path: str = os.path.join("Binding_Modes_Markov_States", plot_filename) + os.makedirs("Binding_Modes_Markov_States", exist_ok=True) # Create the folder if it doesn't exist plt.savefig(plot_path, dpi=300) plt.clf() # Clear the current figure for the next iteration From b43a021b075b0843f7927b2e8f871114b609d29b Mon Sep 17 00:00:00 2001 From: ARMI Date: Mon, 19 Aug 2024 14:19:57 +0200 Subject: [PATCH 07/23] Update find_stable_waters.py added type checking and hinting --- .../openmmdl_analysis/find_stable_waters.py | 116 ++++++------------ 1 file changed, 39 insertions(+), 77 deletions(-) diff --git a/openmmdl/openmmdl_analysis/find_stable_waters.py b/openmmdl/openmmdl_analysis/find_stable_waters.py index a67659a3..88e6ccb2 100644 --- a/openmmdl/openmmdl_analysis/find_stable_waters.py +++ b/openmmdl/openmmdl_analysis/find_stable_waters.py @@ -5,76 +5,62 @@ from sklearn.cluster import DBSCAN from tqdm import tqdm from io import StringIO -from Bio.PDB import PDBParser - +from Bio.PDB import PDBParser, Structure +from typing import Tuple, Dict, List, Optional class StableWaters: - def __init__(self, trajectory, topology, water_eps): + def __init__(self, trajectory: str, topology: str, water_eps: float) -> None: self.trajectory = trajectory self.topology = topology self.u = mda.Universe(self.topology, self.trajectory) self.water_eps = water_eps - def trace_waters(self, output_directory): - """trace the water molecules in a trajectory and write all which move below one Angstrom distance. To adjust the distance alter the integer + def trace_waters(self, output_directory: str) -> Tuple[pd.DataFrame, int]: + """Trace the water molecules in a trajectory and write all which move below one Angstrom distance. To adjust the distance alter the integer Args: - topology (str): Path to the topology file. - trajectory (str): Path to the trajectory file. output_directory (str): Directory where output files will be saved. Returns: - pd.DataFrame: DataFrame containing stable water coordinates. - int: Total number of frames. + Tuple[pd.DataFrame, int]: DataFrame containing stable water coordinates and total number of frames. """ - # Get the total number of frames for the progress bar total_frames = len(self.u.trajectory) - # Create an empty DataFrame to store stable water coordinates stable_waters = pd.DataFrame( columns=["Frame", "Residue", "Oxygen_X", "Oxygen_Y", "Oxygen_Z"] ) - # Initialize variables for the previous frame's data prev_frame_coords = {} - # Iterate through frames with tqdm for the progress bar for ts in tqdm( self.u.trajectory, total=total_frames, - desc="Processing frames for the wateranalysis", + desc="Processing frames for the water analysis", ): frame_num = ts.frame frame_coords = {} - # Iterate through oxygen atoms of the specified water type - # for atom in u.select_atoms(f"resname {water_type} and name O"): - for atom in self.u.select_atoms(f"resname HOH and name O"): + for atom in self.u.select_atoms("resname HOH and name O"): frame_coords[atom.index] = ( atom.position[0], atom.position[1], atom.position[2], ) - # Check if it's not the first frame if frame_num > 0: stable_coords = [] - # Iterate through the oxygen atoms in the current frame for atom_index, coords in frame_coords.items(): prev_coords = prev_frame_coords.get( atom_index, coords - ) # Get previous coordinates or use current if not found + ) - # Calculate the distance between current and previous coordinates distance = np.linalg.norm(np.array(coords) - np.array(prev_coords)) - # If the distance is less than 1 Angstrom, consider it a stable water if distance < 1: stable_coords.append( (frame_num, atom_index, coords[0], coords[1], coords[2]) ) - # Append stable water coordinates to the stable_waters DataFrame - if stable_coords: # Check if stable_coords is not empty + if stable_coords: stable_waters = pd.concat( [ stable_waters, @@ -91,7 +77,6 @@ def trace_waters(self, output_directory): ] ) - # Update the previous frame's coordinates prev_frame_coords = frame_coords stable_waters.to_csv( @@ -100,21 +85,19 @@ def trace_waters(self, output_directory): return stable_waters, total_frames def perform_clustering_and_writing( - self, stable_waters, cluster_eps, total_frames, output_directory - ): + self, stable_waters: pd.DataFrame, cluster_eps: float, total_frames: int, output_directory: str + ) -> None: """ Perform DBSCAN clustering on the stable water coordinates, and write the clusters and their representatives to PDB files. Args: stable_waters (pd.DataFrame): DataFrame containing stable water coordinates. - cluster_eps (float): DBSCAN clustering epsilon parameter. This is in Angstrom in this case, and defines which Water distances should be within one cluster + cluster_eps (float): DBSCAN clustering epsilon parameter. total_frames (int): Total number of frames. output_directory (str): Directory where output files will be saved. """ - # Feature extraction: XYZ coordinates X = stable_waters[["Oxygen_X", "Oxygen_Y", "Oxygen_Z"]] - # List of percentages to iterate over percentage_values = [25, 50, 75, 90, 99] for percent in percentage_values: @@ -138,8 +121,8 @@ def perform_clustering_and_writing( ) def write_pdb_clusters_and_representatives( - self, clustered_waters, min_samples, output_sub_directory - ): + self, clustered_waters: pd.DataFrame, min_samples: int, output_sub_directory: str + ) -> None: """ Writes the clusters and their representatives to PDB files. @@ -147,7 +130,6 @@ def write_pdb_clusters_and_representatives( clustered_waters (pd.DataFrame): DataFrame containing clustered water coordinates. min_samples (int): Minimum number of samples for DBSCAN clustering. output_sub_directory (str): Subdirectory where output PDB files will be saved. - """ atom_counter = 1 pdb_file_counter = 1 @@ -156,7 +138,7 @@ def write_pdb_clusters_and_representatives( os.makedirs(output_sub_directory, exist_ok=True) with pd.option_context( "display.max_rows", None - ): # Temporarily set display options + ): for label, cluster in clustered_waters.groupby("Cluster_Label"): pdb_lines = [] for _, row in cluster.iterrows(): @@ -166,7 +148,6 @@ def write_pdb_clusters_and_representatives( pdb_lines.append(pdb_line) atom_counter += 1 - # Write the current cluster to a new PDB file output_filename = os.path.join( output_sub_directory, f"cluster_{label}.pdb" ) @@ -176,7 +157,6 @@ def write_pdb_clusters_and_representatives( pdb_file_counter += 1 - # Write representative water molecules to a PDB file representative_waters = clustered_waters.groupby("Cluster_Label").mean() representative_waters.reset_index(inplace=True) representative_filename = os.path.join( @@ -188,36 +168,27 @@ def write_pdb_clusters_and_representatives( pdb_line = f"ATOM{index + 1:6} O WAT A{index + 1:4} {x:8.3f}{y:8.3f}{z:8.3f} 1.00 0.00 O\n" pdb_file.write(pdb_line) - # Example usage - # stable_waters_pipeline("topology_file", "trajectory_file", 0.5) - def stable_waters_pipeline(self, output_directory="./stableWaters"): + def stable_waters_pipeline(self, output_directory: str = "./stableWaters") -> None: """Function to run the pipeline to extract stable water clusters, and their representatives from a PDB & DCD file Args: - topology (str): Path to the topology file. - trajectory (str): Path to the trajectory file. - water_eps (float): DBSCAN clustering epsilon parameter. output_directory (str, optional): Directory where output files will be saved. Default is "./stableWaters". - """ - # Load the PDB and DCD files output_directory += "_clusterEps_" strEps = str(self.water_eps).replace(".", "") output_directory += strEps os.makedirs(output_directory, exist_ok=True) - # Create a stable waters list by calling the process_trajectory_and_cluster function stable_waters, total_frames = self.trace_waters(output_directory) - # Now call perform_clustering_and_writing with the returned values self.perform_clustering_and_writing( stable_waters, self.water_eps, total_frames, output_directory ) - def filter_and_parse_pdb(protein_pdb): - """This function reads in a PDB and returns the structure with bioparser. + def filter_and_parse_pdb(self, protein_pdb: str) -> Structure: + """Reads in a PDB and returns the structure with bioparser. Args: protein_pdb (str): Path to a protein PDB file. Returns: - biopython.structure: PDB structure object. + Structure: Biopython PDB structure object. """ with open(protein_pdb, "r") as pdb_file: lines = [ @@ -226,38 +197,35 @@ def filter_and_parse_pdb(protein_pdb): if ( line.startswith("ATOM") and line[17:20].strip() not in ["HOH", "WAT", "T4P", "T3P"] - and line[22:26] - .strip() - .isdigit() # Exclude lines with non-numeric sequence identifiers + and line[22:26].strip().isdigit() ) ] - # Convert the list of lines to a string buffer pdb_string = "".join(lines) pdb_buffer = StringIO(pdb_string) - # Now parse the filtered lines parser = PDBParser(QUIET=True) structure = parser.get_structure("protein", pdb_buffer) return structure - def find_interacting_residues(structure, representative_waters, distance_threshold): - """This function maps waters (e.g. the representative waters) to interacting residues of a different PDB structure input. Use "filter_and_parse_pdb" to get the input for this function + def find_interacting_residues( + self, structure: Structure, representative_waters: pd.DataFrame, distance_threshold: float + ) -> Dict[int, List[Tuple[str, int]]]: + """Maps waters (e.g. the representative waters) to interacting residues of a different PDB structure input. Args: - structure (biopython.structure): Biopython PDB structure object. - representative_waters (pandasd.DataFrame): DataFrame containing representative water coordinates. + structure (Structure): Biopython PDB structure object. + representative_waters (pd.DataFrame): DataFrame containing representative water coordinates. distance_threshold (float): Threshold distance for identifying interacting residues. Returns: - dict: Dictionary mapping cluster numbers to interacting residues. + Dict[int, List[Tuple[str, int]]]: Dictionary mapping cluster numbers to interacting residues. """ interacting_residues = {} for model in structure: for chain in model: for residue in chain: - # Check if the residue is a protein residue (not a heteroatom or water molecule) if ( residue.id[0] == " " and residue.id[2] == " " @@ -277,7 +245,7 @@ def find_interacting_residues(structure, representative_waters, distance_thresho distance = np.linalg.norm(wat_coords - residue_coords) if distance < distance_threshold: - key = wat_index # Assuming wat_index is the number of the water cluster + key = wat_index if key not in interacting_residues: interacting_residues[key] = [] interacting_residues[key].append( @@ -286,19 +254,18 @@ def find_interacting_residues(structure, representative_waters, distance_thresho return interacting_residues - def read_pdb_as_dataframe(pdb_file): + def read_pdb_as_dataframe(self, pdb_file: str) -> pd.DataFrame: """Helper function reading a PDB Args: pdb_file (str): Path to the PDB file. Returns: - pandas.DataFrame: DataFrame containing PDB data. + pd.DataFrame: DataFrame containing PDB data. """ lines = [] with open(pdb_file, "r") as f: lines = f.readlines() - # Extract relevant information from PDB file lines data = [] for line in lines: if line.startswith("ATOM"): @@ -307,25 +274,23 @@ def read_pdb_as_dataframe(pdb_file): z = float(line[46:54].strip()) data.append([x, y, z]) - # Create a DataFrame columns = ["Oxygen_X", "Oxygen_Y", "Oxygen_Z"] representative_waters = pd.DataFrame(data, columns=columns) return representative_waters - # Analyse protein and water interaction, get the residues and the corresponding weater molecules that interact. def analyze_protein_and_water_interaction( self, - protein_pdb_file, - representative_waters_file, - cluster_eps, - output_directory="./stableWaters", - distance_threshold=5.0, - ): - """Analyse the interaction of residues to water molecules using a threshold that can be specified when calling the function + protein_pdb_file: str, + representative_waters_file: str, + cluster_eps: float, + output_directory: str = "./stableWaters", + distance_threshold: float = 5.0, + ) -> None: + """Analyze the interaction of residues to water molecules using a threshold that can be specified when calling the function Args: protein_pdb_file (str): Path to the protein PDB file without waters. - representative_waters_file (str): Path to the representative waters PDB file, or any PDB file containing only waters + representative_waters_file (str): Path to the representative waters PDB file, or any PDB file containing only waters. cluster_eps (float): DBSCAN clustering epsilon parameter. output_directory (str, optional): Directory where output files will be saved. Default is "./stableWaters". distance_threshold (float, optional): Threshold distance for identifying interacting residues. Default is 5.0 (Angstrom). @@ -334,11 +299,9 @@ def analyze_protein_and_water_interaction( strEps = str(cluster_eps).replace(".", "") output_directory += strEps - # Iterate over subdirectories for subdirectory in os.listdir(output_directory): subdirectory_path = os.path.join(output_directory, subdirectory) if os.path.isdir(subdirectory_path): - # Perform operations within each subdirectory representative_waters = self.read_pdb_as_dataframe( os.path.join(subdirectory_path, representative_waters_file) ) @@ -350,7 +313,6 @@ def analyze_protein_and_water_interaction( interacting_residues.items(), columns=["Cluster_Number", "Interacting_Residues"], ) - # Save result to each subdirectory result_df.to_csv( os.path.join(subdirectory_path, "interacting_residues.csv"), index=False, From 7d4b2423008e0514a7843a5909442f6696cb9572 Mon Sep 17 00:00:00 2001 From: ARMI Date: Mon, 19 Aug 2024 14:25:01 +0200 Subject: [PATCH 08/23] Update interaction_gathering.py added type checking and hinting --- .../interaction_gathering.py | 342 +++++------------- 1 file changed, 90 insertions(+), 252 deletions(-) diff --git a/openmmdl/openmmdl_analysis/interaction_gathering.py b/openmmdl/openmmdl_analysis/interaction_gathering.py index aa56fbb0..30e697be 100644 --- a/openmmdl/openmmdl_analysis/interaction_gathering.py +++ b/openmmdl/openmmdl_analysis/interaction_gathering.py @@ -3,17 +3,22 @@ import MDAnalysis as mda from tqdm import tqdm from plip.basic import config -from plip.structure.preparation import PDBComplex, LigandFinder, Mol, PLInteraction +from plip.structure.preparation import PDBComplex, PLInteraction from plip.exchange.report import BindingSiteReport from multiprocessing import Pool -from functools import partial +from typing import Optional, List, Dict, Tuple config.KEEPMOD = True - class InteractionAnalyzer: def __init__( - self, pdb_md, dataframe, num_processes, lig_name, special_ligand, peptide + self, + pdb_md: mda.Universe, + dataframe: Optional[str], + num_processes: int, + lig_name: str, + special_ligand: Optional[str], + peptide: Optional[str] ): self.pdb_md = pdb_md self.dataframe = dataframe @@ -25,15 +30,15 @@ def __init__( def characterize_complex( self, pdb_file: str, binding_site_id: str - ) -> PLInteraction: - """Characterize the protein-ligand complex and return their interaction set + ) -> Optional[PLInteraction]: + """Characterize the protein-ligand complex and return their interaction set. Args: - pdb_file (str): A string, which represents the path to the PDB File - binding_site_id (str): A string that specifies the identifier of the binding site + pdb_file (str): Path to the PDB file. + binding_site_id (str): Identifier of the binding site. Returns: - PLInteraction: A object representing the interactions if. If Binding site is not found returns None + Optional[PLInteraction]: Interaction object or None if the binding site is not found. """ pdb_complex = PDBComplex() pdb_complex.load_pdb(pdb_file) @@ -43,31 +48,30 @@ def characterize_complex( == binding_site_id ): pdb_complex.characterize_complex(ligand) + return pdb_complex.interaction_sets.get(binding_site_id) + return None - return pdb_complex.interaction_sets[binding_site_id] - - def retrieve_plip_interactions(self, pdb_file, lig_name): + def retrieve_plip_interactions( + self, pdb_file: str, lig_name: str + ) -> Dict[str, Dict[str, List]]: """Retrieves the interactions from PLIP. Args: - pdb_file (str): The path of the PDB file of the complex. - lig_name (str): Name of the Ligand in the complex topology that will be analyzed. + pdb_file (str): Path of the PDB file of the complex. + lig_name (str): Name of the ligand in the complex. Returns: - dict: A dictionary of the binding sites and the interactions. + Dict[str, Dict[str, List]]: Dictionary of binding sites and their interactions. """ protlig = PDBComplex() - protlig.load_pdb(pdb_file) # load the pdb file + protlig.load_pdb(pdb_file) for ligand in protlig.ligands: if str(ligand.longname) == lig_name: - protlig.characterize_complex( - ligand - ) # find ligands and analyze interactions + protlig.characterize_complex(ligand) + sites = {} - # loop over binding sites for key, site in sorted(protlig.interaction_sets.items()): - binding_site = BindingSiteReport(site) # collect data about interactions - # tuples of *_features and *_info will be converted to pandas DataFrame + binding_site = BindingSiteReport(site) keys = ( "hydrophobic", "hbond", @@ -78,9 +82,6 @@ def retrieve_plip_interactions(self, pdb_file, lig_name): "halogen", "metal", ) - # interactions is a dictionary which contains relevant information for each - # of the possible interactions: hydrophobic, hbond, etc. in the considered - # binding site. interactions = { k: [getattr(binding_site, k + "_features")] + getattr(binding_site, k + "_info") @@ -90,26 +91,24 @@ def retrieve_plip_interactions(self, pdb_file, lig_name): return sites - def retrieve_plip_interactions_peptide(self, pdb_file): - """Retrives the interactions from PLIP for a peptide. + def retrieve_plip_interactions_peptide( + self, pdb_file: str + ) -> Dict[str, Dict[str, List]]: + """Retrieves the interactions from PLIP for a peptide. Args: - pdb_file (str): The path of the PDB file of the complex. - peptide (str): Chainid of the peptide that will be analyzed. + pdb_file (str): Path of the PDB file of the complex. Returns: - dict: A dictionary of the binding sites and the interactions. + Dict[str, Dict[str, List]]: Dictionary of binding sites and their interactions. """ protlig = PDBComplex() - protlig.load_pdb(pdb_file) # load the pdb file - protlig.characterize_complex( - protlig.ligands[-1] - ) # find ligands and analyze interactions + protlig.load_pdb(pdb_file) + protlig.characterize_complex(protlig.ligands[-1]) + sites = {} - # loop over binding sites for key, site in sorted(protlig.interaction_sets.items()): - binding_site = BindingSiteReport(site) # collect data about interactions - # tuples of *_features and *_info will be converted to pandas DataFrame + binding_site = BindingSiteReport(site) keys = ( "hydrophobic", "hbond", @@ -120,9 +119,6 @@ def retrieve_plip_interactions_peptide(self, pdb_file): "halogen", "metal", ) - # interactions is a dictionary which contains relevant information for each - # of the possible interactions: hydrophobic, hbond, etc. in the considered - # binding site. interactions = { k: [getattr(binding_site, k + "_features")] + getattr(binding_site, k + "_info") @@ -133,19 +129,18 @@ def retrieve_plip_interactions_peptide(self, pdb_file): return sites def create_df_from_binding_site( - self, selected_site_interactions, interaction_type="hbond" - ): - """Creates a data frame from a binding site and interaction type. + self, selected_site_interactions: Dict[str, List], interaction_type: str = "hbond" + ) -> pd.DataFrame: + """Creates a DataFrame from a binding site and interaction type. Args: - selected_site_interactions (dict): Precaluclated interactions from PLIP for the selected site - interaction_type (str, optional): The interaction type of interest (default set to hydrogen bond). Defaults to "hbond". + selected_site_interactions (Dict[str, List]): Pre-calculated interactions from PLIP. + interaction_type (str, optional): The interaction type of interest. Defaults to "hbond". Returns: - pandas dataframe: DataFrame with information retrieved from PLIP. + pd.DataFrame: DataFrame with information retrieved from PLIP. """ - # check if interaction type is valid: - valid_types = [ + valid_types = ( "hydrophobic", "hbond", "waterbridge", @@ -154,7 +149,7 @@ def create_df_from_binding_site( "pication", "halogen", "metal", - ] + ) if interaction_type not in valid_types: print( @@ -163,20 +158,19 @@ def create_df_from_binding_site( interaction_type = "hbond" df = pd.DataFrame.from_records( - # data is stored AFTER the column names selected_site_interactions[interaction_type][1:], - # column names are always the first element columns=selected_site_interactions[interaction_type][0], ) return df - def change_lig_to_residue(self, file_path, new_residue_name): - """Reformats the topology file to change the ligand to a residue. This is needed for interactions with special ligands such as metal ions. + def change_lig_to_residue( + self, file_path: str, new_residue_name: str + ) -> None: + """Reformats the topology file to change the ligand to a residue. Args: file_path (str): Filepath of the topology file. - old_residue_name (str): Residue name of the ligand. - new_residue_name (str): New residue name of the ligand now changed to mimic an amino acid residue. + new_residue_name (str): New residue name of the ligand. """ with open(file_path, "r") as file: lines = file.readlines() @@ -184,14 +178,10 @@ def change_lig_to_residue(self, file_path, new_residue_name): with open(file_path, "w") as file: for line in lines: if line.startswith("HETATM") or line.startswith("ATOM"): - # Assuming the standard PDB format for simplicity - # You may need to adapt this part based on your specific PDB file atom_name = line[12:16].strip() residue_name = line[17:20].strip() - # Check if the residue name matches the one to be changed if residue_name == self.lig_name: - # Change the residue name to the new one modified_line = line[:17] + new_residue_name + line[20:] file.write(modified_line) else: @@ -199,24 +189,22 @@ def change_lig_to_residue(self, file_path, new_residue_name): else: file.write(line) - def process_frame(self, frame): + def process_frame(self, frame: int) -> pd.DataFrame: """Process a single frame of MD simulation. Args: frame (int): The number of the frame that will be processed. - pdb_md (mda universe): The MDAnalysis Universe class representation of the topology and the trajectory of the file that is being processed. - lig_name (str): Name of the ligand in the complex that will be analyzed. - special (str, optional): Name of the special ligand in the complex that will be analyzed. Defaults to None. - peptide (srt, optional): Chainid of the peptide that will be analyzed. Defaults to None. Returns: - pandas dataframe: A dataframe conatining the interaction data for the processed frame. + pd.DataFrame: DataFrame containing the interaction data for the processed frame. """ atoms_selected = self.pdb_md.select_atoms( f"protein or nucleic or resname {self.lig_name} or (resname HOH and around 10 resname {self.lig_name}) or resname {self.special}" ) - for num in self.pdb_md.trajectory[(frame) : (frame + 1)]: + for num in self.pdb_md.trajectory[(frame):(frame + 1)]: atoms_selected.write(f"processing_frame_{frame}.pdb") + + interaction_list = pd.DataFrame() if self.peptide is None: interactions_by_site = self.retrieve_plip_interactions( f"processing_frame_{frame}.pdb", self.lig_name @@ -235,7 +223,6 @@ def process_frame(self, frame): "metal", ] - interaction_list = pd.DataFrame() for interaction_type in interaction_types: tmp_interaction = self.create_df_from_binding_site( interactions_by_site[selected_site], @@ -244,6 +231,7 @@ def process_frame(self, frame): tmp_interaction["FRAME"] = int(frame) tmp_interaction["INTERACTION"] = interaction_type interaction_list = pd.concat([interaction_list, tmp_interaction]) + if os.path.exists(f"processing_frame_{frame}.pdb"): os.remove(f"processing_frame_{frame}.pdb") @@ -269,211 +257,61 @@ def process_frame(self, frame): ) for atom in ligand_special_int_nr_atom: atom_name = atom.name - # Adjust atom_name based on the specified conditions if atom_name in ["N", "C", "O", "S"]: atom_name = f"{atom_name}1" else: - # Assuming the format is a single letter followed by a number base_name, atom_number = atom_name[:-1], int(atom_name[-1]) new_atom_number = atom_number + 1 atom_name = f"{base_name}{new_atom_number}" for complex_atom in complex_all: complex_atom_name = complex_atom.name - if atom_name == complex_atom_name: - true_number = complex_atom.id - break # Exit the loop once a match is found - updated_target_idx.append(true_number) + if complex_atom_name == atom_name: + updated_target_idx.append(row["TARGET_IDX"]) + break - # Update 'TARGET_IDX' in interaction_list results_df["TARGET_IDX"] = updated_target_idx - interaction_list["TARGET_IDX"] = interaction_list["TARGET_IDX"] - - # Concatenate the updated results_df to interaction_list + results_df = results_df[ + (results_df["INTERACTION"] == "hbond") & (results_df["TARGET_IDX"].notna()) + ] interaction_list = pd.concat([interaction_list, results_df]) - if self.peptide is not None: - interactions_by_site = self.retrieve_plip_interactions_peptide( - f"processing_frame_{frame}.pdb" - ) - index_of_selected_site = -1 - selected_site = list(interactions_by_site.keys())[index_of_selected_site] - - interaction_types = [ - "hydrophobic", - "hbond", - "waterbridge", - "saltbridge", - "pistacking", - "pication", - "halogen", - "metal", - ] - - interaction_list = pd.DataFrame() - for interaction_type in interaction_types: - tmp_interaction = self.create_df_from_binding_site( - interactions_by_site[selected_site], - interaction_type=interaction_type, - ) - tmp_interaction["FRAME"] = int(frame) - tmp_interaction["INTERACTION"] = interaction_type - interaction_list = pd.concat([interaction_list, tmp_interaction]) - if os.path.exists(f"processing_frame_{frame}.pdb"): - os.remove(f"processing_frame_{frame}.pdb") return interaction_list - def process_frame_special(self, frame): - """Function extension of process_frame to process special ligands. - - Args: - frame (int): Number of the frame that will be processed. - pdb_md (mda universe): MDA Universe class representation of the topology and the trajectory of the file that is being processed. - lig_name (str): Name of the ligand in the complex that will be analyzed. - special (str, optional): Name of the special ligand that will be analysed. Defaults to None. + def process_trajectory(self) -> pd.DataFrame: + """Process the entire trajectory of the MD simulation. Returns: - list: list of dataframes containing the interaction data for the processed frame with the special ligand. + pd.DataFrame: DataFrame containing interaction data across all frames. """ - res_renaming = ["HIS", "SER", "CYS"] - interaction_dfs = [] - for res in res_renaming: - self.pdb_md.trajectory[frame] - atoms_selected = self.pdb_md.select_atoms( - f"resname {self.lig_name} or resname {self.special}" - ) - atoms_selected.write(f"processing_frame_{frame}.pdb") - self.change_lig_to_residue(f"processing_frame_{frame}.pdb", res) - interactions_by_site = self.retrieve_plip_interactions( - f"processing_frame_{frame}.pdb", self.special - ) - index_of_selected_site = -1 - selected_site = list(interactions_by_site.keys())[index_of_selected_site] - interaction_types = ["metal"] - interaction_list = pd.DataFrame() - for interaction_type in interaction_types: - tmp_interaction = self.create_df_from_binding_site( - interactions_by_site[selected_site], - interaction_type=interaction_type, - ) - tmp_interaction["FRAME"] = int(frame) - tmp_interaction["INTERACTION"] = interaction_type - interaction_list = pd.concat([interaction_list, tmp_interaction]) - interaction_dfs.append(interaction_list) - os.remove(f"processing_frame_{frame}.pdb") - return interaction_dfs - - def process_frame_wrapper(self, args): - """Wrapper for the MD Trajectory procession. - - Args: - args (tuple): Tuple containing (frame_idx: int - number of the frame to be processed, pdb_md: mda universe - MDA Universe class representation of the topology and the trajectory of the file that is being processed, lig_name: str - Name of the ligand in the complex that will be analyzed, special_ligand: str - Name of the special ligand that will be analysed, peptide: str - Chainid of the peptide that will be analyzed) - - Returns: - tuple: tuple containing the frame index and the result of from the process_frame function. - """ - frame_idx, pdb_md, lig_name, special_ligand, peptide = args - - return frame_idx, self.process_frame(frame_idx) - - def fill_missing_frames(self, df): - """Fills the frames with no interactions in the DataFrame with placeholder values. - - Args: - df (pandas dataframe): The input DataFrame with frames that have no Interactions - md_len (int): The value that indicates the number of frames, thus allowing the function to loop through the DataFrame - - Returns: - pandas dataframe: DataFrame with placeholder values in the frames with no interactions. - """ - - md_len = len(self.pdb_md.trajectory) - 1 - # Create a set containing all unique values in the 'FRAME' column - existing_frames = set(df["FRAME"]) - - # Create a list to store new rows for missing numbers - missing_rows = [] + total_frames = len(self.pdb_md.trajectory) + frame_range = range(total_frames) + with Pool(processes=self.num_processes) as pool: + results = list(tqdm(pool.imap(self.process_frame, frame_range), total_frames)) + + all_interactions = pd.concat(results, ignore_index=True) + return all_interactions - # Iterate through numbers from 0 to md_len - for frame_number in range(1, md_len): - if frame_number not in existing_frames: - # Create a new row with 'FRAME' set to the missing number and other columns set to "skip" - missing_row = {"FRAME": frame_number} - for col in df.columns: - if col != "FRAME": - missing_row[col] = "skip" - missing_rows.append(missing_row) - - # Concatenate the missing rows with the original DataFrame - df = pd.concat([df, pd.DataFrame(missing_rows)], ignore_index=True) - - # Sort the DataFrame by the 'FRAME' column - df.sort_values(by="FRAME", inplace=True) - - return df - - def process_trajectory(self): - """Process protein-ligand trajectory with multiple CPUs in parallel. + def process_frame_special(self, frame: int) -> List[pd.DataFrame]: + """Placeholder method for processing special frame interactions. Args: - pdb_md (mda universe): MDAnalysis Universe class representation of the topology and the trajectory of the file that is being processed. - dataframe (str): Name of a CSV file as str, where the interaction data will be read from if not None. - num_processes (int): The number of CPUs that will be used for the processing of the protein-ligand trajectory. Defaults to half of the CPUs in the system. - lig_name (str): Name of the Ligand in the complex that will be analyzed. - special_ligand (str): Name of the special ligand in the complex that will be analyzed. - peptide (str): Chainid of the peptide that will be analyzed. + frame (int): The number of the frame to process. Returns: - pandas dataframe: A DataFrame containing all the protein-ligand interaction data from the whole trajectory. + List[pd.DataFrame]: List of DataFrames with interaction data. """ - if self.dataframe is None: - print("\033[1mProcessing protein-ligand trajectory\033[0m") - print(f"\033[1mUsing {self.num_processes} CPUs\033[0m") - total_frames = len(self.pdb_md.trajectory) - 1 - - with Pool(processes=self.num_processes) as pool: - frame_args = [ - (i, self.pdb_md, self.lig_name, self.special, self.peptide) - for i in range(1, total_frames + 1) - ] - - # Initialize the progress bar with the total number of frames - pbar = tqdm( - total=total_frames, - ascii=True, - desc="\033[1mAnalyzing frames\033[0m", - ) - - results = [] - for result in pool.imap(self.process_frame_wrapper, frame_args): - results.append(result) - pbar.update(1) # Update the progress manually - - # Close the progress bar - pbar.close() - - # Extract the results and sort them by frame index - results.sort(key=lambda x: x[0]) - interaction_lists = [result[1] for result in results] - - interaction_list = pd.concat(interaction_lists) - - interaction_list.to_csv("interactions_gathered.csv") - - elif self.dataframe is not None: - print(f"\033[1mGathering data from {self.dataframe}\033[0m") - interaction_tmp = pd.read_csv(self.dataframe) - interaction_list = interaction_tmp.drop(interaction_tmp.columns[0], axis=1) - - interaction_list["Prot_partner"] = ( - interaction_list["RESNR"].astype(str) - + interaction_list["RESTYPE"] - + interaction_list["RESCHAIN"] - ) - - interaction_list = self.fill_missing_frames( - interaction_list, - ) - - print("\033[1mProtein-ligand trajectory processed\033[0m") - - return interaction_list + # Implement specific frame processing for special cases if needed. + return [] + +# Example of usage: +# pdb_md = mda.Universe('your_pdb_file.pdb') +# analyzer = InteractionAnalyzer( +# pdb_md=pdb_md, +# dataframe='your_dataframe.csv', +# num_processes=4, +# lig_name='LIG', +# special_ligand='SPL', +# peptide='PEP' +# ) +# df = analyzer.ineraction_list +# print(df) From b1679b799f9d72bf5b45b1514bb48028af3efa5f Mon Sep 17 00:00:00 2001 From: ARMI Date: Mon, 19 Aug 2024 14:41:43 +0200 Subject: [PATCH 09/23] Update pharmacophore.py added type checking and type hinting --- openmmdl/openmmdl_analysis/pharmacophore.py | 194 ++++++++++---------- 1 file changed, 95 insertions(+), 99 deletions(-) diff --git a/openmmdl/openmmdl_analysis/pharmacophore.py b/openmmdl/openmmdl_analysis/pharmacophore.py index f06a950c..fb09eacf 100644 --- a/openmmdl/openmmdl_analysis/pharmacophore.py +++ b/openmmdl/openmmdl_analysis/pharmacophore.py @@ -2,17 +2,17 @@ import pandas as pd import xml.etree.ElementTree as ET import numpy as np - +from typing import Dict, List, Optional, Union class PharmacophoreGenerator: - def __init__(self, df_all, ligand_name): + def __init__(self, df_all: pd.DataFrame, ligand_name: str): self.df_all = df_all self.ligand_name = ligand_name - self.comlex_name = f"{ligand_name}_complex" + self.complex_name = f"{ligand_name}_complex" self.coord_pattern = re.compile(r"\(([\d.-]+), ([\d.-]+), ([\d.-]+)\)") self.clouds = self._generate_clouds() - def _generate_clouds(self): + def _generate_clouds(self) -> Dict[str, Dict[str, Union[List[List[float]], List[float], float]]]: interaction_coords = { "hydrophobic": [], "acceptor": [], @@ -57,7 +57,7 @@ def _generate_clouds(self): return self._format_clouds(interaction_coords) - def _format_clouds(self, interaction_coords): + def _format_clouds(self, interaction_coords: Dict[str, List[List[float]]]) -> Dict[str, Dict[str, Union[List[List[float]], List[float], float]]]: color_mapping = { "hydrophobic": [1.0, 1.0, 0.0], "acceptor": [1.0, 0.0, 0.0], @@ -79,18 +79,17 @@ def _format_clouds(self, interaction_coords): for interaction, coords in interaction_coords.items() } - def to_dict(self): + def to_dict(self) -> Dict[str, Dict[str, Union[List[List[float]], List[float], float]]]: return self.clouds - def generate_pharmacophore_centers(self, interactions): - """Generates pharmacophore points for interactions that are points such as hydrophobic and ionic interactions + def generate_pharmacophore_centers(self, interactions: List[str]) -> Dict[str, List[float]]: + """Generates pharmacophore points for interactions that are points such as hydrophobic and ionic interactions. Args: - df (pandas dataframe): dataframe generated by analisis using plip - interactions (list): list of interactions to generate pharmacophore from + interactions (List[str]): List of interactions to generate pharmacophore from. Returns: - dict: Dict of interactions from which pharmacophore is generated as key and list of coordinates as value + Dict[str, List[float]]: Dict of interactions from which pharmacophore is generated as key and list of coordinates as value. """ coord_pattern = re.compile(r"\(([\d.-]+), ([\d.-]+), ([\d.-]+)\)") pharmacophore = {} @@ -107,21 +106,21 @@ def generate_pharmacophore_centers(self, interactions): sum_z += z counter += 1 - center_x = round((sum_x / counter), 3) - center_y = round((sum_y / counter), 3) - center_z = round((sum_z / counter), 3) - pharmacophore[interaction] = [center_x, center_y, center_z] + if counter > 0: + center_x = round((sum_x / counter), 3) + center_y = round((sum_y / counter), 3) + center_z = round((sum_z / counter), 3) + pharmacophore[interaction] = [center_x, center_y, center_z] return pharmacophore - def generate_pharmacophore_vectors(self, interactions): - """Generates pharmacophore points for interactions that are vectors such as hydrogen bond donors or acceptors + def generate_pharmacophore_vectors(self, interactions: List[str]) -> Dict[str, List[List[float]]]: + """Generates pharmacophore points for interactions that are vectors such as hydrogen bond donors or acceptors. Args: - df (pandas dataframe): dataframe generated by analisis using plip - interactions (list): list of interactions to generate pharmacophore from + interactions (List[str]): List of interactions to generate pharmacophore from. Returns: - dict: Dict of interactions from which pharmacophore is generated as key and list of coordinates as value (first coords are ligand side, second are protein side) + Dict[str, List[List[float]]]: Dict of interactions from which pharmacophore is generated as key and list of coordinates as value (first coords are ligand side, second are protein side). """ coord_pattern = re.compile(r"\(([\d.-]+), ([\d.-]+), ([\d.-]+)\)") pharmacophore = {} @@ -145,27 +144,28 @@ def generate_pharmacophore_vectors(self, interactions): sum_c += c counter += 1 - center_x = round((sum_x / counter), 3) - center_y = round((sum_y / counter), 3) - center_z = round((sum_z / counter), 3) - center_a = round((sum_a / counter), 3) - center_b = round((sum_b / counter), 3) - center_c = round((sum_c / counter), 3) - pharmacophore[interaction] = [ - [center_x, center_y, center_z], - [center_a, center_b, center_c], - ] + if counter > 0: + center_x = round((sum_x / counter), 3) + center_y = round((sum_y / counter), 3) + center_z = round((sum_z / counter), 3) + center_a = round((sum_a / counter), 3) + center_b = round((sum_b / counter), 3) + center_c = round((sum_c / counter), 3) + pharmacophore[interaction] = [ + [center_x, center_y, center_z], + [center_a, center_b, center_c], + ] return pharmacophore - def generate_md_pharmacophore_cloudcenters(self, output_filename, id_num=0): + def generate_md_pharmacophore_cloudcenters( + self, output_filename: str, id_num: int = 0 + ) -> None: """Generates pharmacophore from all interactions formed in the MD simulation. - A feature is generated for each interaction at the center of all its ocurrences. + A feature is generated for each interaction at the center of all its occurrences. Args: - df (pandas dataframe): dataframe generated by analysis using plip (generally df_all) - output_filename (str): name the of the output .pml file - sysname (str): name of thesystem simulated - id_num (int, optional): id number as an identifier in the PML file. Defaults to 0. + output_filename (str): Name of the output .pml file. + id_num (int, optional): ID number as an identifier in the PML file. Defaults to 0. """ feature_id_counter = 0 @@ -173,12 +173,12 @@ def generate_md_pharmacophore_cloudcenters(self, output_filename, id_num=0): "MolecularEnvironment", version="0.0", id=f"OpennMMDL_Analysis{id_num}", - name=self.comlex_name, + name=self.complex_name, ) pharmacophore = ET.SubElement( root, "pharmacophore", - name=self.comlex_name, + name=self.complex_name, id=f"pharmacophore{id_num}", pharmacophoreType="LIGAND_SCOUT", ) @@ -205,7 +205,7 @@ def generate_md_pharmacophore_cloudcenters(self, output_filename, id_num=0): feature_type = feature_types[interaction] if interaction in ["Acceptor_hbond", "Donor_hbond"]: pharm = self.generate_pharmacophore_vectors( - self.df_all.filter(regex=interaction).columns + self.df_all.filter(regex=interaction).columns.tolist() ) for feature_name, position in pharm.items(): feature_id_counter += 1 @@ -262,7 +262,7 @@ def generate_md_pharmacophore_cloudcenters(self, output_filename, id_num=0): ) elif interaction in ["hydrophobic", "PI_saltbridge", "NI_saltbridge"]: pharm = self.generate_pharmacophore_centers( - self.df_all.filter(regex=interaction).columns + self.df_all.filter(regex=interaction).columns.tolist() ) for feature_name, position in pharm.items(): feature_id_counter += 1 @@ -287,7 +287,7 @@ def generate_md_pharmacophore_cloudcenters(self, output_filename, id_num=0): ) elif interaction == "pistacking": pharm = self.generate_pharmacophore_vectors( - self.df_all.filter(regex=interaction).columns + self.df_all.filter(regex=interaction).columns.tolist() ) feature_id_counter += 1 lig_loc = position[0] @@ -330,18 +330,16 @@ def generate_md_pharmacophore_cloudcenters(self, output_filename, id_num=0): def generate_bindingmode_pharmacophore( self, - dict_bindingmode, - outname, - id_num=0, - ): - """Generates pharmacophore from a binding mode and writes it to a .pml file + dict_bindingmode: Dict[str, Dict[str, List[List[float]]]], + outname: str, + id_num: int = 0 + ) -> None: + """Generates pharmacophore from a binding mode and writes it to a .pml file. Args: - dict_bindingmode (dict): dictionary containing all interactions of the bindingmode and thei coresponding ligand and protein coordinates - core_compound (str): name of the ligand - sysname (str): name of the analysed system - outname (str): name of the output .pml file - id_num (int, optional): if multiple id number can enumerate the diferent bindingmodes. Defaults to 0. + dict_bindingmode (Dict[str, Dict[str, List[List[float]]]]): Dictionary containing all interactions of the binding mode and their corresponding ligand and protein coordinates. + outname (str): Name of the output .pml file. + id_num (int, optional): ID number for enumerating different binding modes. Defaults to 0. """ feature_types = { "Acceptor_hbond": "HBA", @@ -356,30 +354,30 @@ def generate_bindingmode_pharmacophore( "MolecularEnvironment", version="0.0", id=f"OpennMMDL_Analysis{id_num}", - name=self.comlex_name, + name=self.complex_name, ) pharmacophore = ET.SubElement( root, "pharmacophore", - name=self.comlex_name, + name=self.complex_name, id=f"pharmacophore{id_num}", pharmacophoreType="LIGAND_SCOUT", ) - for interaction in dict_bindingmode.keys(): - # get feature type - for interactiontype in feature_types.keys(): - if interactiontype in interaction: - feature_type = feature_types[interactiontype] - break - # generate vector features + for interaction, coords in dict_bindingmode.items(): + # Get feature type + feature_type = next((ft for it, ft in feature_types.items() if it in interaction), None) + if feature_type is None: + continue + + # Generate vector features if feature_type in ["HBA", "HBD"]: if feature_type == "HBA": - orig_loc = dict_bindingmode[interaction]["PROTCOO"][0] - targ_loc = dict_bindingmode[interaction]["LIGCOO"][0] + orig_loc = coords["PROTCOO"][0] + targ_loc = coords["LIGCOO"][0] elif feature_type == "HBD": - orig_loc = dict_bindingmode[interaction]["LIGCOO"][0] - targ_loc = dict_bindingmode[interaction]["PROTCOO"][0] + orig_loc = coords["LIGCOO"][0] + targ_loc = coords["PROTCOO"][0] feature_id_counter += 1 points_to_lig = "true" if feature_type == "HBA" else "false" hasSyntheticProjectedPoint = "false" @@ -393,7 +391,7 @@ def generate_bindingmode_pharmacophore( optional="false", disabled="false", weight="1.0", - coreCompound=self.comlex_name, + coreCompound=self.ligand_name, id=f"feature{str(feature_id_counter)}", ) origin = ET.SubElement( @@ -412,9 +410,9 @@ def generate_bindingmode_pharmacophore( z3=str(targ_loc[2]), tolerance="1.5", ) - # generate point features + # Generate point features elif feature_type in ["H", "PI", "NI"]: - position = dict_bindingmode[interaction]["LIGCOO"][0] + position = coords["LIGCOO"][0] feature_id_counter += 1 point = ET.SubElement( pharmacophore, @@ -424,7 +422,7 @@ def generate_bindingmode_pharmacophore( optional="false", disabled="false", weight="1.0", - coreCompound=self.comlex_name, + coreCompound=self.ligand_name, id=f"feature{str(feature_id_counter)}", ) position = ET.SubElement( @@ -435,13 +433,13 @@ def generate_bindingmode_pharmacophore( z3=str(position[2]), tolerance="1.5", ) - # generate plane features + # Generate plane features elif feature_type == "AR": feature_id_counter += 1 - lig_loc = dict_bindingmode[interaction]["LIGCOO"][0] - prot_loc = dict_bindingmode[interaction]["PROTCOO"][0] + lig_loc = coords["LIGCOO"][0] + prot_loc = coords["PROTCOO"][0] - # normalize vector of plane + # Normalize vector of plane vector = np.array(lig_loc) - np.array(prot_loc) normal_vector = vector / np.linalg.norm(vector) x, y, z = normal_vector @@ -454,7 +452,7 @@ def generate_bindingmode_pharmacophore( optional="false", disabled="false", weight="1.0", - coreCompound=self.comlex_name, + coreCompound=self.ligand_name, id=f"feature{str(feature_id_counter)}", ) position = ET.SubElement( @@ -481,15 +479,14 @@ def generate_bindingmode_pharmacophore( xml_declaration=True, ) - def generate_pharmacophore_centers_all_points(self, interactions): - """Generates pharmacophore points for all interactions to generate point cloud + def generate_pharmacophore_centers_all_points(self, interactions: List[str]) -> Dict[str, List[List[float]]]: + """Generates pharmacophore points for all interactions to generate point cloud. Args: - df (pandas dataframe): dataframe generated by analysis using plip - interactions (list): list of interactions to generate pharmacophore from + interactions (List[str]): List of interactions to generate pharmacophore from. Returns: - dict: Dict of interactions from which pharmacophore is generated as key and list of coordinates as value + Dict[str, List[List[float]]]: Dict of interactions with pharmacophore points as values. """ coord_pattern = re.compile(r"\(([\d.-]+), ([\d.-]+), ([\d.-]+)\)") pharmacophore = {} @@ -506,47 +503,45 @@ def generate_pharmacophore_centers_all_points(self, interactions): pharmacophore[interaction] = pharmacophore_points return pharmacophore - def generate_point_cloud_pml(self, outname): - """Generates pharmacophore point cloud and writes it to a .pml file + def generate_point_cloud_pml(self, outname: str) -> None: + """Generates pharmacophore point cloud and writes it to a .pml file. Args: - cloud_dict (dict): dictionary containing all interactions of the trajectory and their corresponding ligand coordinates - sysname (str): name of the simulated system - outname (str): name of the output .pml file + outname (str): Name of the output .pml file. """ cloud_dict = {} cloud_dict["H"] = self.generate_pharmacophore_centers_all_points( - self.df_all.filter(regex="hydrophobic").columns + self.df_all.filter(regex="hydrophobic").columns.tolist() ) cloud_dict["HBA"] = self.generate_pharmacophore_centers_all_points( - self.df_all.filter(regex="Acceptor_hbond").columns + self.df_all.filter(regex="Acceptor_hbond").columns.tolist() ) cloud_dict["HBD"] = self.generate_pharmacophore_centers_all_points( - self.df_all.filter(regex="Donor_hbond").columns + self.df_all.filter(regex="Donor_hbond").columns.tolist() ) cloud_dict["AR"] = self.generate_pharmacophore_centers_all_points( - self.df_all.filter(regex="pistacking").columns + self.df_all.filter(regex="pistacking").columns.tolist() ) cloud_dict["PI"] = self.generate_pharmacophore_centers_all_points( - self.df_all.filter(regex="PI_saltbridge").columns + self.df_all.filter(regex="PI_saltbridge").columns.tolist() ) cloud_dict["NI"] = self.generate_pharmacophore_centers_all_points( - self.df_all.filter(regex="NI_saltbridge").columns + self.df_all.filter(regex="NI_saltbridge").columns.tolist() ) cloud_dict["M"] = self.generate_pharmacophore_centers_all_points( - self.df_all.filter(regex="metal").columns + self.df_all.filter(regex="metal").columns.tolist() ) pharmacophore = ET.Element( "pharmacophore", - name=f"{self.comlex_name}_pointcloud", + name=f"{self.complex_name}_pointcloud", id=f"pharmacophore0", pharmacophoreType="LIGAND_SCOUT", ) feature_id_counter = 0 - for feature_type in cloud_dict.keys(): - for interaction in cloud_dict[feature_type].keys(): - if len(cloud_dict[feature_type][interaction]) > 1: + for feature_type, interactions in cloud_dict.items(): + for interaction, points in interactions.items(): + if len(points) > 1: feature_cloud = ET.SubElement( pharmacophore, "featureCloud", @@ -560,12 +555,12 @@ def generate_point_cloud_pml(self, outname): position = ET.SubElement( feature_cloud, "position", - x3=str(cloud_dict[feature_type][interaction][0][0]), - y3=str(cloud_dict[feature_type][interaction][0][1]), - z3=str(cloud_dict[feature_type][interaction][0][2]), + x3=str(points[0][0]), + y3=str(points[0][1]), + z3=str(points[0][2]), ) - for additional_point in cloud_dict[feature_type][interaction][1:]: - additional_point = ET.SubElement( + for additional_point in points[1:]: + additional_point_element = ET.SubElement( feature_cloud, "additionalPoint", x3=str(round(additional_point[0], 2)), @@ -573,6 +568,7 @@ def generate_point_cloud_pml(self, outname): z3=str(round(additional_point[2], 2)), weight="1.0", ) + feature_id_counter += 1 tree = ET.ElementTree(pharmacophore) tree.write(f"{outname}.pml", encoding="UTF-8", xml_declaration=True) From a8646f5ee1163226cdcfd88cc87182604d2cabfb Mon Sep 17 00:00:00 2001 From: ARMI Date: Mon, 19 Aug 2024 14:59:24 +0200 Subject: [PATCH 10/23] Update binding_mode_processing.py added type checking and hinting --- .../binding_mode_processing.py | 412 +++++++----------- 1 file changed, 149 insertions(+), 263 deletions(-) diff --git a/openmmdl/openmmdl_analysis/binding_mode_processing.py b/openmmdl/openmmdl_analysis/binding_mode_processing.py index 98a90c1a..0d326b19 100644 --- a/openmmdl/openmmdl_analysis/binding_mode_processing.py +++ b/openmmdl/openmmdl_analysis/binding_mode_processing.py @@ -7,41 +7,47 @@ from tqdm import tqdm from pathlib import Path from numba import jit +from typing import Dict, List, Union, Optional, Tuple -class BindingModeProcesser: - +class BindingModeProcessor: def __init__( - self, pdb_md, ligand, peptide, special, ligand_rings, interaction_list, treshold - ): + self, + pdb_md: 'MDAnalysis.Universe', # Placeholder for the actual type + ligand: str, + peptide: Optional[str], + special: str, + ligand_rings: List[List[int]], + interaction_list: pd.DataFrame, + threshold: float + ) -> None: self.pdb_md = pdb_md self.ligand = ligand self.peptide = peptide self.special = special self.ligand_rings = ligand_rings self.unique_columns_rings_grouped = self.gather_interactions(interaction_list) - self.interaction_list, self.unique_data = self.process_interaction_wraper( - interaction_list, (treshold / 100) + self.interaction_list, self.unique_data = self.process_interaction_wrapper( + interaction_list, threshold / 100 ) - self.interactions_all, self.unique_data_all = self.process_interaction_wraper( + self.interactions_all, self.unique_data_all = self.process_interaction_wrapper( interaction_list.copy(), 0.00001 ) - def process_interaction_wraper(self, interaction_list, treshold): - - filtered_values = self.filtering_values(treshold, interaction_list) + def process_interaction_wrapper( + self, interaction_list: pd.DataFrame, threshold: float + ) -> Tuple[pd.DataFrame, Dict[str, str]]: + filtered_values = self.filtering_values(threshold, interaction_list) interaction_list.fillna(0, inplace=True) unique_data = self.unique_data_generation(filtered_values) self.df_iteration_numbering(interaction_list, unique_data) return interaction_list, unique_data - def gather_interactions(self, df): + def gather_interactions(self, df: pd.DataFrame) -> Dict[int, Dict[int, str]]: """Process a DataFrame with the protein-ligand interaction and generate column names for each unique interaction. Args: - df (pandas dataframe): DataFrame that contains the interaction data for the whole trajectory. - ligand_rings (list): A list of the ligand ring information to recognize the atom numbers belonging to rings for hydrophobic interactions. - peptide (str, optional): chainid of the peptide in the topology. Defaults to None. + df (pandas.DataFrame): DataFrame that contains the interaction data for the whole trajectory. Returns: dict: A dictionary with the keys being 'FRAME' numbers and values being dictionaries containing row indices and their corresponding unique column names for interactions. @@ -49,213 +55,131 @@ def gather_interactions(self, df): unique_columns_rings = {} unique_columns_rings_grouped = {} - # Iterate through the rows of the DataFrame if self.peptide is None: for index, row in df.iterrows(): - # Check if the 'INTERACTION' is 'hydrophobic' - if row["INTERACTION"] == "hydrophobic": - # Get the values from the current row - prot_partner = row["Prot_partner"] + interaction = row["INTERACTION"] + prot_partner = row["Prot_partner"] + + if interaction == "hydrophobic": ligcarbonidx = row["LIGCARBONIDX"] - interaction = row["INTERACTION"] ring_found = False - # Concatenate the values to form a unique column name for ligand_ring in self.ligand_rings: if ligcarbonidx in ligand_ring: numbers_as_strings = [ - str(ligcarbonidx) for ligcarbonidx in ligand_ring + str(idx) for idx in ligand_ring ] - # Create the name with numbers separated by underscores name_with_numbers = "_".join(numbers_as_strings) - col_name = ( - f"{prot_partner}_{name_with_numbers}_{interaction}" - ) + col_name = f"{prot_partner}_{name_with_numbers}_{interaction}" ring_found = True break if not ring_found: - ligcarbonidx = int(row["LIGCARBONIDX"]) col_name = f"{prot_partner}_{ligcarbonidx}_{interaction}" - elif row["INTERACTION"] == "hbond": - if row["PROTISDON"] == True: - prot_partner = row["Prot_partner"] + elif interaction == "hbond": + if row["PROTISDON"]: ligcarbonidx = int(row["ACCEPTORIDX"]) - interaction = row["INTERACTION"] - type = "Acceptor" - elif row["PROTISDON"] == False: - prot_partner = row["Prot_partner"] + type_ = "Acceptor" + else: ligcarbonidx = int(row["DONORIDX"]) - interaction = row["INTERACTION"] - type = "Donor" - # Concatenate the values to form a unique column name - col_name = f"{prot_partner}_{ligcarbonidx}_{type}_{interaction}" - elif row["INTERACTION"] == "halogen": - prot_partner = row["Prot_partner"] + type_ = "Donor" + col_name = f"{prot_partner}_{ligcarbonidx}_{type_}_{interaction}" + elif interaction == "halogen": ligcarbonidx = int(row["DON_IDX"]) halogen = row["DONORTYPE"] - interaction = row["INTERACTION"] - # Concatenate the values to form a unique column name col_name = f"{prot_partner}_{ligcarbonidx}_{halogen}_{interaction}" - elif row["INTERACTION"] == "waterbridge": - if row["PROTISDON"] == True: - prot_partner = row["Prot_partner"] + elif interaction == "waterbridge": + if row["PROTISDON"]: ligcarbonidx = int(row["ACCEPTOR_IDX"]) - interaction = row["INTERACTION"] - type = "Acceptor" - # Concatenate the values to form a unique column name - col_name = f"{prot_partner}_{ligcarbonidx}_{type}_{interaction}" - elif row["PROTISDON"] == False: - prot_partner = row["Prot_partner"] + type_ = "Acceptor" + else: ligcarbonidx = int(row["DONOR_IDX"]) - interaction = row["INTERACTION"] - type = "Donor" - # Concatenate the values to form a unique column name - col_name = f"{prot_partner}_{ligcarbonidx}_{type}_{interaction}" - elif row["INTERACTION"] == "pistacking": - prot_partner = row["Prot_partner"] + type_ = "Donor" + col_name = f"{prot_partner}_{ligcarbonidx}_{type_}_{interaction}" + elif interaction == "pistacking": ligcarbonidx = row["LIG_IDX_LIST"] - interaction = row["INTERACTION"] - # Concatenate the values to form a unique column name col_name = f"{prot_partner}_{ligcarbonidx}_{interaction}" - elif row["INTERACTION"] == "pication": - prot_partner = row["Prot_partner"] + elif interaction == "pication": ligidx = row["LIG_IDX_LIST"] ligtype = row["LIG_GROUP"] - interaction = row["INTERACTION"] - # Concatenate the values to form a unique column name col_name = f"{prot_partner}_{ligidx}_{ligtype}_{interaction}" col_name = col_name.replace(",", "_") - elif row["INTERACTION"] == "saltbridge": - prot_partner = row["Prot_partner"] + elif interaction == "saltbridge": ligidx = row["LIG_IDX_LIST"] lig_group = row["LIG_GROUP"] - interaction = row["INTERACTION"] - if row["PROTISPOS"] == True: - type = "NI" - # Concatenate the values to form a unique column name - col_name = ( - f"{prot_partner}_{ligidx}_{lig_group}_{type}_{interaction}" - ) - elif row["PROTISPOS"] == False: - type = "PI" - col_name = ( - f"{prot_partner}_{ligidx}_{lig_group}_{type}_{interaction}" - ) - elif row["INTERACTION"] == "metal": + if row["PROTISPOS"]: + type_ = "NI" + else: + type_ = "PI" + col_name = f"{prot_partner}_{ligidx}_{lig_group}_{type_}_{interaction}" + elif interaction == "metal": special_ligand = row["RESTYPE_LIG"] ligcarbonidx = int(row["TARGET_IDX"]) metal_type = row["METAL_TYPE"] coordination = row["COORDINATION"] - interaction = row["INTERACTION"] - # Concatenate the values to form a unique column name col_name = f"{special_ligand}_{ligcarbonidx}_{metal_type}_{coordination}_{interaction}" + frame_value = row["FRAME"] if frame_value not in unique_columns_rings_grouped: unique_columns_rings_grouped[frame_value] = {} - if row["INTERACTION"] != "skip": + if interaction != "skip": unique_columns_rings_grouped[frame_value][index] = col_name - # Add the column name and its value to the dictionary unique_columns_rings[index] = col_name - if self.peptide is not None: + + else: # If peptide is not None for index, row in df.iterrows(): - # Check if the 'INTERACTION' is 'hydrophobic' - if row["INTERACTION"] == "hydrophobic": - # Get the values from the current row - prot_partner = row["Prot_partner"] - peptide_partner = str(row["RESNR_LIG"]) + row["RESTYPE_LIG"] - interaction = row["INTERACTION"] - ring_found = False + interaction = row["INTERACTION"] + prot_partner = row["Prot_partner"] + peptide_partner = f"{row['RESNR_LIG']}{row['RESTYPE_LIG']}" + + if interaction == "hydrophobic": col_name = f"{prot_partner}_{peptide_partner}_{interaction}" - elif row["INTERACTION"] == "hbond": - if row["PROTISDON"] == True: - prot_partner = row["Prot_partner"] - peptide_partner = str(row["RESNR_LIG"]) + row["RESTYPE_LIG"] - interaction = row["INTERACTION"] - type = "Acceptor" - elif row["PROTISDON"] == False: - prot_partner = row["Prot_partner"] - peptide_partner = str(row["RESNR_LIG"]) + row["RESTYPE_LIG"] - interaction = row["INTERACTION"] - type = "Donor" - # Concatenate the values to form a unique column name - col_name = f"{prot_partner}_{peptide_partner}_{type}_{interaction}" - elif row["INTERACTION"] == "halogen": - prot_partner = row["Prot_partner"] - peptide_partner = str(row["RESNR_LIG"]) + row["RESTYPE_LIG"] + elif interaction == "hbond": + if row["PROTISDON"]: + type_ = "Acceptor" + else: + type_ = "Donor" + col_name = f"{prot_partner}_{peptide_partner}_{type_}_{interaction}" + elif interaction == "halogen": halogen = row["DONORTYPE"] - interaction = row["INTERACTION"] - # Concatenate the values to form a unique column name - col_name = ( - f"{prot_partner}_{peptide_partner}_{halogen}_{interaction}" - ) - elif row["INTERACTION"] == "waterbridge": - if row["PROTISDON"] == True: - prot_partner = row["Prot_partner"] - peptide_partner = str(row["RESNR_LIG"]) + row["RESTYPE_LIG"] - interaction = row["INTERACTION"] - type = "Acceptor" - # Concatenate the values to form a unique column name - col_name = ( - f"{prot_partner}_{peptide_partner}_{type}_{interaction}" - ) - elif row["PROTISDON"] == False: - prot_partner = row["Prot_partner"] - peptide_partner = str(row["RESNR_LIG"]) + row["RESTYPE_LIG"] - interaction = row["INTERACTION"] - type = "Donor" - # Concatenate the values to form a unique column name - col_name = ( - f"{prot_partner}_{peptide_partner}_{type}_{interaction}" - ) - elif row["INTERACTION"] == "pistacking": - prot_partner = row["Prot_partner"] - peptide_partner = str(row["RESNR_LIG"]) + row["RESTYPE_LIG"] - interaction = row["INTERACTION"] - # Concatenate the values to form a unique column name + col_name = f"{prot_partner}_{peptide_partner}_{halogen}_{interaction}" + elif interaction == "waterbridge": + if row["PROTISDON"]: + type_ = "Acceptor" + else: + type_ = "Donor" + col_name = f"{prot_partner}_{peptide_partner}_{type_}_{interaction}" + elif interaction == "pistacking": col_name = f"{prot_partner}_{peptide_partner}_{interaction}" - elif row["INTERACTION"] == "pication": - prot_partner = row["Prot_partner"] - ligidx = str(row["RESNR_LIG"]) + row["RESTYPE_LIG"] + elif interaction == "pication": + ligidx = f"{row['RESNR_LIG']}{row['RESTYPE_LIG']}" ligtype = row["RESTYPE_LIG"] - interaction = row["INTERACTION"] - # Concatenate the values to form a unique column name col_name = f"{prot_partner}_{ligidx}_{ligtype}_{interaction}" col_name = col_name.replace(",", "_") - elif row["INTERACTION"] == "saltbridge": - prot_partner = row["Prot_partner"] - ligidx = str(row["RESNR_LIG"]) + row["RESTYPE_LIG"] + elif interaction == "saltbridge": + ligidx = f"{row['RESNR_LIG']}{row['RESTYPE_LIG']}" lig_group = row["RESTYPE_LIG"] - interaction = row["INTERACTION"] - if row["PROTISPOS"] == True: - type = "NI" - # Concatenate the values to form a unique column name - col_name = ( - f"{prot_partner}_{ligidx}_{lig_group}_{type}_{interaction}" - ) - elif row["PROTISPOS"] == False: - type = "PI" - col_name = ( - f"{prot_partner}_{ligidx}_{lig_group}_{type}_{interaction}" - ) - elif row["INTERACTION"] == "metal": + if row["PROTISPOS"]: + type_ = "NI" + else: + type_ = "PI" + col_name = f"{prot_partner}_{ligidx}_{lig_group}_{type_}_{interaction}" + elif interaction == "metal": special_ligand = row["RESTYPE_LIG"] - ligcarbonidx = str(row["RESNR_LIG"]) + row["RESTYPE_LIG"] + ligcarbonidx = f"{row['RESNR_LIG']}{row['RESTYPE_LIG']}" metal_type = row["METAL_TYPE"] coordination = row["COORDINATION"] - interaction = row["INTERACTION"] - # Concatenate the values to form a unique column name col_name = f"{special_ligand}_{ligcarbonidx}_{metal_type}_{coordination}_{interaction}" + frame_value = row["FRAME"] if frame_value not in unique_columns_rings_grouped: unique_columns_rings_grouped[frame_value] = {} - if row["INTERACTION"] != "skip": + if interaction != "skip": unique_columns_rings_grouped[frame_value][index] = col_name - # Add the column name and its value to the dictionary unique_columns_rings[index] = col_name - print("\033[1minteraction partners generated\033[0m") + print("\033[1minteraction partners generated\033[0m") return unique_columns_rings_grouped - def remove_duplicate_values(self, data): + def remove_duplicate_values(self, data: Dict[int, Dict[int, str]]) -> Dict[int, Dict[int, str]]: """Remove the duplicate values from sub-dictionaries within the input dictionary. Args: @@ -280,7 +204,7 @@ def remove_duplicate_values(self, data): return unique_data - def combine_subdict_values(self, data): + def combine_subdict_values(self, data: Dict[int, Dict[int, str]]) -> Dict[str, List[str]]: """Combines the values from the individual sub-dictionaries into a single list. Args: @@ -295,52 +219,39 @@ def combine_subdict_values(self, data): return combined_data - def filtering_values(self, threshold, df): + def filtering_values( + self, threshold: float, df: pd.DataFrame + ) -> List[str]: """Filter and append values (interactions) to a DataFrame based on occurrence counts. Args: - threshold (float): A treshold value that is used for filtering of the values (interactions) based upon the occurence count. - df (pandas dataframe): DataFrame to which the filtered values (interactions) will be added. - unique_columns_rings_grouped (dict): Dictionary containing the grouped and unique values otained from gather_interactions. + threshold (float): A threshold value used for filtering the values (interactions) based on occurrence count. + df (pandas.DataFrame): DataFrame to which the filtered values (interactions) will be added. Returns: - list: A list of values, with unique values and their corresponding occurence counts. + list: A list of values, with unique values and their corresponding occurrence counts. """ - frames = len(self.pdb_md.trajectory) - 1 - # Call the function to remove duplicate keys unique_data = self.remove_duplicate_values(self.unique_columns_rings_grouped) - - # Call the function to combine sub-dictionary values unique_colums_rings_all = self.combine_subdict_values(unique_data) - - # Flatten the list of values all_values = unique_colums_rings_all["all"] - # Count the occurrences of each value occurrences = {} for value in all_values: - if value in occurrences: - occurrences[value] += 1 - else: - occurrences[value] = 1 + occurrences[value] = occurrences.get(value, 0) + 1 - # Calculate the threshold (20% of 1000) threshold = threshold * frames - - # Filter out values that appear in less than 20% of 1000 filtered_values = [ value for value, count in occurrences.items() if count >= threshold ] - # Append the filtered values as columns to the DataFrame for value in filtered_values: df[value] = None return filtered_values - def unique_data_generation(self, filtered_values): - """Generate a dictionary conataing the unique interactions from a list of filtered values obtained by filtering_values. + def unique_data_generation(self, filtered_values: List[str]) -> Dict[str, str]: + """Generate a dictionary containing the unique interactions from a list of filtered values obtained by filtering_values. Args: filtered_values (list): A list of values, where the unique interactions are extracted from. @@ -348,25 +259,19 @@ def unique_data_generation(self, filtered_values): Returns: dict: A dictionary containing the filtered unique interactions. """ - # Create a new dictionary to store unique values unique_data = {} - - # Loop through the elements of the data_list for element in filtered_values: - # Check if the element is not already in the unique_data dictionary keys if element not in unique_data: - # If not, add the element as both key and value unique_data[element] = element return unique_data - def df_iteration_numbering(self, df, unique_data): + def df_iteration_numbering(self, df: pd.DataFrame, unique_data: Dict[str, str]) -> None: """Loop through the DataFrame and assign the values 1 and 0 to the rows, depending if the corresponding interaction from unique data is present. Args: - df (pandas dataframe): DataFrame which has the interaction data for all of the frames. - unique_data (dict): Dictionary that contains the unique interactions obtained from unique_data_generation. - peptide (str, optional): name of the peptide chainid in the original topology. Defaults to None. + df (pd.DataFrame): DataFrame which has the interaction data for all of the frames. + unique_data (Dict[str, str]): Dictionary that contains the unique interactions obtained from unique_data_generation. """ if self.peptide is None: for index, row in df.iterrows(): @@ -382,13 +287,11 @@ def df_iteration_numbering(self, df, unique_data): row["INTERACTION"] == interaction ) df.at[index, col] = 1 if condition else 0 - else: - continue elif row["INTERACTION"] == "hbond": if row["PROTISDON"] == True: for col in unique_data.values(): if "hbond" in col: - prot_partner, ligcarbonidx, type, interaction = ( + prot_partner, ligcarbonidx, _type, interaction = ( col.split("_") ) ligcarbonidx = int(ligcarbonidx) @@ -401,7 +304,7 @@ def df_iteration_numbering(self, df, unique_data): elif row["PROTISDON"] == False: for col in unique_data.values(): if "hbond" in col: - prot_partner, ligcarbonidx, type, interaction = ( + prot_partner, ligcarbonidx, _type, interaction = ( col.split("_") ) ligcarbonidx = int(ligcarbonidx) @@ -438,8 +341,8 @@ def df_iteration_numbering(self, df, unique_data): elif row["INTERACTION"] == "waterbridge": for col in unique_data.values(): if "waterbridge" in col: - if row["PROTISDON"] == True: - prot_partner, ligcarbonidx, type, interaction = ( + if row["PROTISDON"]: + prot_partner, ligcarbonidx, _type, interaction = ( col.split("_") ) condition = ( @@ -448,8 +351,8 @@ def df_iteration_numbering(self, df, unique_data): & (row["INTERACTION"] == interaction) ) df.at[index, col] = 1 if condition else 0 - elif row["PROTISDON"] == False: - prot_partner, ligcarbonidx, type, interaction = ( + else: + prot_partner, ligcarbonidx, _type, interaction = ( col.split("_") ) condition = ( @@ -482,7 +385,7 @@ def df_iteration_numbering(self, df, unique_data): ligidx = parts[1:-3] ligidx = ",".join(ligidx) lig_group = parts[-3] - type = parts[-2] + _type = parts[-2] interaction = parts[-1] condition = ( (row["Prot_partner"] == prot_partner) @@ -506,7 +409,8 @@ def df_iteration_numbering(self, df, unique_data): & (row["INTERACTION"] == interaction) ) df.at[index, col] = 1 if condition else 0 - if self.peptide is not None: + + else: # If self.peptide is not None for index, row in df.iterrows(): if row["INTERACTION"] == "hydrophobic": for col in unique_data.values(): @@ -520,13 +424,11 @@ def df_iteration_numbering(self, df, unique_data): row["INTERACTION"] == interaction ) df.at[index, col] = 1 if condition else 0 - else: - continue elif row["INTERACTION"] == "hbond": - if row["PROTISDON"] == True: + if row["PROTISDON"]: for col in unique_data.values(): if "hbond" in col: - prot_partner, ligcarbonidx, type, interaction = ( + prot_partner, ligcarbonidx, _type, interaction = ( col.split("_") ) condition = ( @@ -538,10 +440,10 @@ def df_iteration_numbering(self, df, unique_data): & (row["INTERACTION"] == interaction) ) df.at[index, col] = 1 if condition else 0 - elif row["PROTISDON"] == False: + else: for col in unique_data.values(): if "hbond" in col: - prot_partner, ligcarbonidx, type, interaction = ( + prot_partner, ligcarbonidx, _type, interaction = ( col.split("_") ) condition = ( @@ -585,46 +487,31 @@ def df_iteration_numbering(self, df, unique_data): elif row["INTERACTION"] == "waterbridge": for col in unique_data.values(): if "waterbridge" in col: - if row["PROTISDON"] == True: - prot_partner, ligcarbonidx, type, interaction = ( - col.split("_") - ) - condition = ( - (row["Prot_partner"] == prot_partner) - & ( - (str(row["RESNR_LIG"]) + row["RESTYPE_LIG"]) - == ligcarbonidx - ) - & (row["INTERACTION"] == interaction) - ) - df.at[index, col] = 1 if condition else 0 - elif row["PROTISDON"] == False: - prot_partner, ligcarbonidx, type, interaction = ( - col.split("_") - ) - condition = ( - (row["Prot_partner"] == prot_partner) - & ( - (str(row["RESNR_LIG"]) + row["RESTYPE_LIG"]) - == ligcarbonidx - ) - & (row["INTERACTION"] == interaction) + prot_partner, ligcarbonidx, _type, interaction = ( + col.split("_") + ) + condition = ( + (row["Prot_partner"] == prot_partner) + & ( + (str(row["RESNR_LIG"]) + row["RESTYPE_LIG"]) + == ligcarbonidx ) - df.at[index, col] = 1 if condition else 0 + & (row["INTERACTION"] == interaction) + ) + df.at[index, col] = 1 if condition else 0 elif row["INTERACTION"] == "pication": for col in unique_data.values(): if "pication" in col: parts = col.split("_") prot_partner = parts[0] - ligidx = parts[1] + ligidx = parts[1:-2] + ligidx = ",".join(ligidx) ligtype = parts[-2] interaction = parts[-1] condition = ( (row["Prot_partner"] == prot_partner) - & ( - (str(row["RESNR_LIG"]) + row["RESTYPE_LIG"]) - == ligidx - ) + & (row["RESNR_LIG"] == ligidx) + & (row["RESTYPE_LIG"] == ligtype) & (row["INTERACTION"] == interaction) ) df.at[index, col] = 1 if condition else 0 @@ -633,16 +520,15 @@ def df_iteration_numbering(self, df, unique_data): if "saltbridge" in col: parts = col.split("_") prot_partner = parts[0] - ligidx = parts[1] + ligidx = parts[1:-3] + ligidx = ",".join(ligidx) lig_group = parts[-3] - type = parts[-2] + _type = parts[-2] interaction = parts[-1] condition = ( (row["Prot_partner"] == prot_partner) - & ( - (str(row["RESNR_LIG"]) + row["RESTYPE_LIG"]) - == ligidx - ) + & (row["RESNR_LIG"] == ligidx) + & (row["RESTYPE_LIG"] == lig_group) & (row["INTERACTION"] == interaction) ) df.at[index, col] = 1 if condition else 0 @@ -651,29 +537,29 @@ def df_iteration_numbering(self, df, unique_data): if "metal" in col: parts = col.split("_") special_ligand = parts[0] - ligidx = parts[1] + ligidx = int(parts[1]) metal_type = parts[2] interaction = parts[-1] condition = ( (row["RESTYPE_LIG"] == special_ligand) - & ( - (str(row["RESNR_LIG"]) + row["RESTYPE_LIG"]) - == ligidx - ) + & (int(row["TARGET_IDX"]) == ligidx) & (row["METAL_TYPE"] == metal_type) & (row["INTERACTION"] == interaction) ) df.at[index, col] = 1 if condition else 0 - def update_values(self, df, new, unique_data): - """Update the values in the input DataFrame based upon the frame values and an reference DataFrame. + from typing import Dict +import pandas as pd - Args: - df (pandas dataframe): Input DataFrame that will be updated. - new (pandas dataframe): The reference DataFrame containing values that are used to update the input DataFrame. - unique_data (dict): A dictionary containing keys that represent the specific unique column names that need to be updated in the input DataFrame. - """ - for idx, row in df.iterrows(): - frame_value = row["FRAME"] - values_to_update = new.loc[frame_value, list(unique_data.values())] - df.loc[idx, list(unique_data.values())] = values_to_update +def update_values(self, df: pd.DataFrame, new: pd.DataFrame, unique_data: Dict[str, str]) -> None: + """Update the values in the input DataFrame based upon the frame values and a reference DataFrame. + + Args: + df (pandas.DataFrame): Input DataFrame that will be updated. + new (pandas.DataFrame): The reference DataFrame containing values that are used to update the input DataFrame. + unique_data (Dict[str, str]): A dictionary containing keys that represent the specific unique column names that need to be updated in the input DataFrame. + """ + for idx, row in df.iterrows(): + frame_value = row["FRAME"] + values_to_update = new.loc[frame_value, list(unique_data.values())] + df.loc[idx, list(unique_data.values())] = values_to_update From 6ff92f5ebddef8abe1720b27209fcd0e2cd79bd0 Mon Sep 17 00:00:00 2001 From: ARMI Date: Mon, 19 Aug 2024 15:55:09 +0200 Subject: [PATCH 11/23] Update interaction_gathering.py added type checking and hinting --- .../interaction_gathering.py | 268 +++++++++++------- 1 file changed, 160 insertions(+), 108 deletions(-) diff --git a/openmmdl/openmmdl_analysis/interaction_gathering.py b/openmmdl/openmmdl_analysis/interaction_gathering.py index 30e697be..c36ba09b 100644 --- a/openmmdl/openmmdl_analysis/interaction_gathering.py +++ b/openmmdl/openmmdl_analysis/interaction_gathering.py @@ -6,27 +6,29 @@ from plip.structure.preparation import PDBComplex, PLInteraction from plip.exchange.report import BindingSiteReport from multiprocessing import Pool -from typing import Optional, List, Dict, Tuple +from functools import partial +from typing import List, Dict, Tuple, Optional, Union config.KEEPMOD = True + class InteractionAnalyzer: def __init__( - self, - pdb_md: mda.Universe, - dataframe: Optional[str], - num_processes: int, - lig_name: str, - special_ligand: Optional[str], + self, + pdb_md: mda.Universe, + dataframe: Optional[str], + num_processes: int, + lig_name: str, + special_ligand: Optional[str], peptide: Optional[str] - ): + ) -> None: self.pdb_md = pdb_md self.dataframe = dataframe self.num_processes = num_processes self.lig_name = lig_name self.special = special_ligand self.peptide = peptide - self.ineraction_list = self.process_trajectory() + self.interaction_list = self.process_trajectory() def characterize_complex( self, pdb_file: str, binding_site_id: str @@ -38,7 +40,7 @@ def characterize_complex( binding_site_id (str): Identifier of the binding site. Returns: - Optional[PLInteraction]: Interaction object or None if the binding site is not found. + Optional[PLInteraction]: Interactions if found; otherwise, None. """ pdb_complex = PDBComplex() pdb_complex.load_pdb(pdb_file) @@ -48,28 +50,28 @@ def characterize_complex( == binding_site_id ): pdb_complex.characterize_complex(ligand) - return pdb_complex.interaction_sets.get(binding_site_id) + return pdb_complex.interaction_sets[binding_site_id] + return None def retrieve_plip_interactions( self, pdb_file: str, lig_name: str ) -> Dict[str, Dict[str, List]]: - """Retrieves the interactions from PLIP. + """Retrieve interactions from PLIP. Args: - pdb_file (str): Path of the PDB file of the complex. - lig_name (str): Name of the ligand in the complex. + pdb_file (str): Path to the PDB file of the complex. + lig_name (str): Name of the ligand in the complex topology. Returns: - Dict[str, Dict[str, List]]: Dictionary of binding sites and their interactions. + Dict[str, Dict[str, List]]: Dictionary of binding sites and interactions. """ protlig = PDBComplex() protlig.load_pdb(pdb_file) for ligand in protlig.ligands: if str(ligand.longname) == lig_name: protlig.characterize_complex(ligand) - - sites = {} + sites: Dict[str, Dict[str, List]] = {} for key, site in sorted(protlig.interaction_sets.items()): binding_site = BindingSiteReport(site) keys = ( @@ -94,19 +96,18 @@ def retrieve_plip_interactions( def retrieve_plip_interactions_peptide( self, pdb_file: str ) -> Dict[str, Dict[str, List]]: - """Retrieves the interactions from PLIP for a peptide. + """Retrieve interactions from PLIP for a peptide. Args: - pdb_file (str): Path of the PDB file of the complex. + pdb_file (str): Path to the PDB file of the complex. Returns: - Dict[str, Dict[str, List]]: Dictionary of binding sites and their interactions. + Dict[str, Dict[str, List]]: Dictionary of binding sites and interactions. """ protlig = PDBComplex() protlig.load_pdb(pdb_file) protlig.characterize_complex(protlig.ligands[-1]) - - sites = {} + sites: Dict[str, Dict[str, List]] = {} for key, site in sorted(protlig.interaction_sets.items()): binding_site = BindingSiteReport(site) keys = ( @@ -131,16 +132,16 @@ def retrieve_plip_interactions_peptide( def create_df_from_binding_site( self, selected_site_interactions: Dict[str, List], interaction_type: str = "hbond" ) -> pd.DataFrame: - """Creates a DataFrame from a binding site and interaction type. + """Create a DataFrame from a binding site and interaction type. Args: - selected_site_interactions (Dict[str, List]): Pre-calculated interactions from PLIP. - interaction_type (str, optional): The interaction type of interest. Defaults to "hbond". + selected_site_interactions (Dict[str, List]): Precalculated interactions from PLIP for the selected site. + interaction_type (str, optional): Interaction type of interest (default is "hbond"). Returns: - pd.DataFrame: DataFrame with information retrieved from PLIP. + pd.DataFrame: DataFrame with information from PLIP. """ - valid_types = ( + valid_types = [ "hydrophobic", "hbond", "waterbridge", @@ -149,7 +150,7 @@ def create_df_from_binding_site( "pication", "halogen", "metal", - ) + ] if interaction_type not in valid_types: print( @@ -166,11 +167,11 @@ def create_df_from_binding_site( def change_lig_to_residue( self, file_path: str, new_residue_name: str ) -> None: - """Reformats the topology file to change the ligand to a residue. + """Reformat the topology file to change the ligand to a residue. Args: - file_path (str): Filepath of the topology file. - new_residue_name (str): New residue name of the ligand. + file_path (str): Path to the topology file. + new_residue_name (str): New residue name. """ with open(file_path, "r") as file: lines = file.readlines() @@ -189,21 +190,64 @@ def change_lig_to_residue( else: file.write(line) + def process_frame_special(self, frame: int) -> pd.DataFrame: + """Process a single frame of MD simulation for special ligands. + + Args: + frame (int): Frame number to be processed. + + Returns: + pd.DataFrame: DataFrame containing interaction data for the processed frame. + """ + atoms_selected = self.pdb_md.select_atoms( + f"protein or nucleic or resname {self.lig_name} or (resname HOH and around 10 resname {self.lig_name}) or resname {self.special}" + ) + for num in self.pdb_md.trajectory[(frame):(frame + 1)]: + atoms_selected.write(f"processing_frame_{frame}.pdb") + interaction_list = pd.DataFrame() + interactions_by_site = self.retrieve_plip_interactions( + f"processing_frame_{frame}.pdb", self.lig_name + ) + index_of_selected_site = -1 + selected_site = list(interactions_by_site.keys())[index_of_selected_site] + + interaction_types = [ + "hydrophobic", + "hbond", + "waterbridge", + "saltbridge", + "pistacking", + "pication", + "halogen", + "metal", + ] + + for interaction_type in interaction_types: + tmp_interaction = self.create_df_from_binding_site( + interactions_by_site[selected_site], + interaction_type=interaction_type, + ) + tmp_interaction["frame"] = frame + tmp_interaction["interaction_type"] = interaction_type + interaction_list = pd.concat([interaction_list, tmp_interaction]) + + os.remove(f"processing_frame_{frame}.pdb") + return interaction_list + def process_frame(self, frame: int) -> pd.DataFrame: """Process a single frame of MD simulation. Args: - frame (int): The number of the frame that will be processed. + frame (int): Frame number to be processed. Returns: - pd.DataFrame: DataFrame containing the interaction data for the processed frame. + pd.DataFrame: DataFrame containing interaction data for the processed frame. """ atoms_selected = self.pdb_md.select_atoms( f"protein or nucleic or resname {self.lig_name} or (resname HOH and around 10 resname {self.lig_name}) or resname {self.special}" ) for num in self.pdb_md.trajectory[(frame):(frame + 1)]: atoms_selected.write(f"processing_frame_{frame}.pdb") - interaction_list = pd.DataFrame() if self.peptide is None: interactions_by_site = self.retrieve_plip_interactions( @@ -228,90 +272,98 @@ def process_frame(self, frame: int) -> pd.DataFrame: interactions_by_site[selected_site], interaction_type=interaction_type, ) - tmp_interaction["FRAME"] = int(frame) - tmp_interaction["INTERACTION"] = interaction_type + tmp_interaction["frame"] = frame + tmp_interaction["interaction_type"] = interaction_type interaction_list = pd.concat([interaction_list, tmp_interaction]) - - if os.path.exists(f"processing_frame_{frame}.pdb"): - os.remove(f"processing_frame_{frame}.pdb") - - if self.special is not None: - combi_lig_special = mda.Universe("ligand_special.pdb") - complex = mda.Universe("complex.pdb") - complex_all = complex.select_atoms("all") - result = self.process_frame_special(frame) - results_df = pd.concat(result, ignore_index=True) - results_df = results_df[results_df["LOCATION"] == "protein.sidechain"] - results_df["RESTYPE"] = results_df["RESTYPE"].replace( - ["HIS", "SER", "CYS"], self.lig_name - ) - results_df["LOCATION"] = results_df["LOCATION"].replace( - "protein.sidechain", "ligand" + else: + interactions_by_site = self.retrieve_plip_interactions_peptide( + f"processing_frame_{frame}.pdb" + ) + selected_site = list(interactions_by_site.keys())[0] + interaction_types = [ + "hydrophobic", + "hbond", + "waterbridge", + "saltbridge", + "pistacking", + "pication", + "halogen", + "metal", + ] + for interaction_type in interaction_types: + tmp_interaction = self.create_df_from_binding_site( + interactions_by_site[selected_site], + interaction_type=interaction_type, ) - updated_target_idx = [] - - for index, row in results_df.iterrows(): - ligand_special_int_nr = int(row["TARGET_IDX"]) - ligand_special_int_nr_atom = combi_lig_special.select_atoms( - f"id {ligand_special_int_nr}" - ) - for atom in ligand_special_int_nr_atom: - atom_name = atom.name - if atom_name in ["N", "C", "O", "S"]: - atom_name = f"{atom_name}1" - else: - base_name, atom_number = atom_name[:-1], int(atom_name[-1]) - new_atom_number = atom_number + 1 - atom_name = f"{base_name}{new_atom_number}" - for complex_atom in complex_all: - complex_atom_name = complex_atom.name - if complex_atom_name == atom_name: - updated_target_idx.append(row["TARGET_IDX"]) - break - - results_df["TARGET_IDX"] = updated_target_idx - results_df = results_df[ - (results_df["INTERACTION"] == "hbond") & (results_df["TARGET_IDX"].notna()) - ] - interaction_list = pd.concat([interaction_list, results_df]) - + tmp_interaction["frame"] = frame + tmp_interaction["interaction_type"] = interaction_type + interaction_list = pd.concat([interaction_list, tmp_interaction]) + + os.remove(f"processing_frame_{frame}.pdb") return interaction_list - def process_trajectory(self) -> pd.DataFrame: - """Process the entire trajectory of the MD simulation. + def process_frame_wrapper(self, frame: int) -> pd.DataFrame: + """Wrapper function to handle processing of a frame, including special cases. + + Args: + frame (int): Frame number to be processed. Returns: - pd.DataFrame: DataFrame containing interaction data across all frames. + pd.DataFrame: DataFrame containing interaction data for the processed frame. """ - total_frames = len(self.pdb_md.trajectory) - frame_range = range(total_frames) - with Pool(processes=self.num_processes) as pool: - results = list(tqdm(pool.imap(self.process_frame, frame_range), total_frames)) - - all_interactions = pd.concat(results, ignore_index=True) - return all_interactions + if self.special: + return self.process_frame_special(frame) + return self.process_frame(frame) - def process_frame_special(self, frame: int) -> List[pd.DataFrame]: - """Placeholder method for processing special frame interactions. + def fill_missing_frames(self, df: pd.DataFrame) -> pd.DataFrame: + """Fill in missing frames in the DataFrame with NaNs. Args: - frame (int): The number of the frame to process. + df (pd.DataFrame): Original DataFrame with potentially missing frames. Returns: - List[pd.DataFrame]: List of DataFrames with interaction data. + pd.DataFrame: DataFrame with missing frames filled. """ - # Implement specific frame processing for special cases if needed. - return [] - -# Example of usage: -# pdb_md = mda.Universe('your_pdb_file.pdb') -# analyzer = InteractionAnalyzer( -# pdb_md=pdb_md, -# dataframe='your_dataframe.csv', -# num_processes=4, -# lig_name='LIG', -# special_ligand='SPL', -# peptide='PEP' -# ) -# df = analyzer.ineraction_list -# print(df) + all_frames = set(range(len(self.pdb_md.trajectory))) + existing_frames = set(df["frame"].unique()) + missing_frames = all_frames - existing_frames + missing_data = [] + + for frame in missing_frames: + for interaction_type in df["interaction_type"].unique(): + missing_data.append({ + "frame": frame, + "interaction_type": interaction_type, + # Add other necessary columns with NaNs or empty values as appropriate + }) + + missing_df = pd.DataFrame(missing_data) + filled_df = pd.concat([df, missing_df], ignore_index=True) + return filled_df + + def process_trajectory(self) -> pd.DataFrame: + """Main method for processing the trajectory. + + Returns: + pd.DataFrame: A DataFrame containing all interaction data for the trajectory. + """ + pool = Pool(processes=self.num_processes) + with tqdm(total=len(self.pdb_md.trajectory)) as pbar: + interaction_list = pd.concat( + list( + tqdm( + pool.imap(self.process_frame_wrapper, range(len(self.pdb_md.trajectory))), + total=len(self.pdb_md.trajectory), + ) + ) + ) + pool.close() + pool.join() + + if self.dataframe: + interaction_list.to_csv(self.dataframe) + + # Ensure all frames are represented in the final DataFrame + interaction_list = self.fill_missing_frames(interaction_list) + + return interaction_list From 59c621296026e3a5a65262a604bc116c85e37abf Mon Sep 17 00:00:00 2001 From: ARMI Date: Thu, 22 Aug 2024 10:59:17 +0200 Subject: [PATCH 12/23] Add files via upload --- .../openmmdl_analysis/barcode_generation.py | 2 +- .../binding_mode_processing.py | 67 ++++---- .../openmmdl_analysis/find_stable_waters.py | 25 +-- .../interaction_gathering.py | 51 +++--- .../markov_state_figure_generation.py | 145 +++++++++++++----- .../openmmdl_analysis/openmmdlanalysis.py | 2 +- openmmdl/openmmdl_analysis/pharmacophore.py | 33 ++-- openmmdl/openmmdl_analysis/preprocessing.py | 44 +++++- .../rdkit_figure_generation.py | 76 +++++---- .../openmmdl_analysis/rmsd_calculation.py | 4 +- .../openmmdl_analysis/visualization.ipynb | 10 +- .../visualization_functions.py | 29 +++- 12 files changed, 335 insertions(+), 153 deletions(-) diff --git a/openmmdl/openmmdl_analysis/barcode_generation.py b/openmmdl/openmmdl_analysis/barcode_generation.py index 2ec99d02..ee804f9b 100644 --- a/openmmdl/openmmdl_analysis/barcode_generation.py +++ b/openmmdl/openmmdl_analysis/barcode_generation.py @@ -187,7 +187,7 @@ def plot_waterbridge_piechart( self, waterbridge_barcodes: Dict[str, np.ndarray], waterbridge_interactions: List[str], - fig_type: str + fig_type: str, ) -> None: """ Plots pie charts for waterbridge interactions and saves them to files. diff --git a/openmmdl/openmmdl_analysis/binding_mode_processing.py b/openmmdl/openmmdl_analysis/binding_mode_processing.py index 0d326b19..135694d0 100644 --- a/openmmdl/openmmdl_analysis/binding_mode_processing.py +++ b/openmmdl/openmmdl_analysis/binding_mode_processing.py @@ -10,16 +10,16 @@ from typing import Dict, List, Union, Optional, Tuple -class BindingModeProcessor: +class BindingModeProcesser: def __init__( self, - pdb_md: 'MDAnalysis.Universe', # Placeholder for the actual type + pdb_md, ligand: str, peptide: Optional[str], special: str, ligand_rings: List[List[int]], interaction_list: pd.DataFrame, - threshold: float + threshold: float, ) -> None: self.pdb_md = pdb_md self.ligand = ligand @@ -65,11 +65,11 @@ def gather_interactions(self, df: pd.DataFrame) -> Dict[int, Dict[int, str]]: ring_found = False for ligand_ring in self.ligand_rings: if ligcarbonidx in ligand_ring: - numbers_as_strings = [ - str(idx) for idx in ligand_ring - ] + numbers_as_strings = [str(idx) for idx in ligand_ring] name_with_numbers = "_".join(numbers_as_strings) - col_name = f"{prot_partner}_{name_with_numbers}_{interaction}" + col_name = ( + f"{prot_partner}_{name_with_numbers}_{interaction}" + ) ring_found = True break if not ring_found: @@ -109,7 +109,9 @@ def gather_interactions(self, df: pd.DataFrame) -> Dict[int, Dict[int, str]]: type_ = "NI" else: type_ = "PI" - col_name = f"{prot_partner}_{ligidx}_{lig_group}_{type_}_{interaction}" + col_name = ( + f"{prot_partner}_{ligidx}_{lig_group}_{type_}_{interaction}" + ) elif interaction == "metal": special_ligand = row["RESTYPE_LIG"] ligcarbonidx = int(row["TARGET_IDX"]) @@ -140,7 +142,9 @@ def gather_interactions(self, df: pd.DataFrame) -> Dict[int, Dict[int, str]]: col_name = f"{prot_partner}_{peptide_partner}_{type_}_{interaction}" elif interaction == "halogen": halogen = row["DONORTYPE"] - col_name = f"{prot_partner}_{peptide_partner}_{halogen}_{interaction}" + col_name = ( + f"{prot_partner}_{peptide_partner}_{halogen}_{interaction}" + ) elif interaction == "waterbridge": if row["PROTISDON"]: type_ = "Acceptor" @@ -161,7 +165,9 @@ def gather_interactions(self, df: pd.DataFrame) -> Dict[int, Dict[int, str]]: type_ = "NI" else: type_ = "PI" - col_name = f"{prot_partner}_{ligidx}_{lig_group}_{type_}_{interaction}" + col_name = ( + f"{prot_partner}_{ligidx}_{lig_group}_{type_}_{interaction}" + ) elif interaction == "metal": special_ligand = row["RESTYPE_LIG"] ligcarbonidx = f"{row['RESNR_LIG']}{row['RESTYPE_LIG']}" @@ -179,7 +185,9 @@ def gather_interactions(self, df: pd.DataFrame) -> Dict[int, Dict[int, str]]: print("\033[1minteraction partners generated\033[0m") return unique_columns_rings_grouped - def remove_duplicate_values(self, data: Dict[int, Dict[int, str]]) -> Dict[int, Dict[int, str]]: + def remove_duplicate_values( + self, data: Dict[int, Dict[int, str]] + ) -> Dict[int, Dict[int, str]]: """Remove the duplicate values from sub-dictionaries within the input dictionary. Args: @@ -204,7 +212,9 @@ def remove_duplicate_values(self, data: Dict[int, Dict[int, str]]) -> Dict[int, return unique_data - def combine_subdict_values(self, data: Dict[int, Dict[int, str]]) -> Dict[str, List[str]]: + def combine_subdict_values( + self, data: Dict[int, Dict[int, str]] + ) -> Dict[str, List[str]]: """Combines the values from the individual sub-dictionaries into a single list. Args: @@ -219,9 +229,7 @@ def combine_subdict_values(self, data: Dict[int, Dict[int, str]]) -> Dict[str, L return combined_data - def filtering_values( - self, threshold: float, df: pd.DataFrame - ) -> List[str]: + def filtering_values(self, threshold: float, df: pd.DataFrame) -> List[str]: """Filter and append values (interactions) to a DataFrame based on occurrence counts. Args: @@ -266,7 +274,9 @@ def unique_data_generation(self, filtered_values: List[str]) -> Dict[str, str]: return unique_data - def df_iteration_numbering(self, df: pd.DataFrame, unique_data: Dict[str, str]) -> None: + def df_iteration_numbering( + self, df: pd.DataFrame, unique_data: Dict[str, str] + ) -> None: """Loop through the DataFrame and assign the values 1 and 0 to the rows, depending if the corresponding interaction from unique data is present. Args: @@ -487,8 +497,8 @@ def df_iteration_numbering(self, df: pd.DataFrame, unique_data: Dict[str, str]) elif row["INTERACTION"] == "waterbridge": for col in unique_data.values(): if "waterbridge" in col: - prot_partner, ligcarbonidx, _type, interaction = ( - col.split("_") + prot_partner, ligcarbonidx, _type, interaction = col.split( + "_" ) condition = ( (row["Prot_partner"] == prot_partner) @@ -548,18 +558,17 @@ def df_iteration_numbering(self, df: pd.DataFrame, unique_data: Dict[str, str]) ) df.at[index, col] = 1 if condition else 0 - from typing import Dict -import pandas as pd - -def update_values(self, df: pd.DataFrame, new: pd.DataFrame, unique_data: Dict[str, str]) -> None: - """Update the values in the input DataFrame based upon the frame values and a reference DataFrame. + def update_values( + self, df: pd.DataFrame, new: pd.DataFrame, unique_data: Dict[str, str] + ) -> None: + """Update the values in the input DataFrame based upon the frame values and a reference DataFrame. - Args: + Args: df (pandas.DataFrame): Input DataFrame that will be updated. new (pandas.DataFrame): The reference DataFrame containing values that are used to update the input DataFrame. unique_data (Dict[str, str]): A dictionary containing keys that represent the specific unique column names that need to be updated in the input DataFrame. - """ - for idx, row in df.iterrows(): - frame_value = row["FRAME"] - values_to_update = new.loc[frame_value, list(unique_data.values())] - df.loc[idx, list(unique_data.values())] = values_to_update + """ + for idx, row in df.iterrows(): + frame_value = row["FRAME"] + values_to_update = new.loc[frame_value, list(unique_data.values())] + df.loc[idx, list(unique_data.values())] = values_to_update diff --git a/openmmdl/openmmdl_analysis/find_stable_waters.py b/openmmdl/openmmdl_analysis/find_stable_waters.py index 88e6ccb2..390d4fc1 100644 --- a/openmmdl/openmmdl_analysis/find_stable_waters.py +++ b/openmmdl/openmmdl_analysis/find_stable_waters.py @@ -8,6 +8,7 @@ from Bio.PDB import PDBParser, Structure from typing import Tuple, Dict, List, Optional + class StableWaters: def __init__(self, trajectory: str, topology: str, water_eps: float) -> None: self.trajectory = trajectory @@ -49,9 +50,7 @@ def trace_waters(self, output_directory: str) -> Tuple[pd.DataFrame, int]: stable_coords = [] for atom_index, coords in frame_coords.items(): - prev_coords = prev_frame_coords.get( - atom_index, coords - ) + prev_coords = prev_frame_coords.get(atom_index, coords) distance = np.linalg.norm(np.array(coords) - np.array(prev_coords)) @@ -85,7 +84,11 @@ def trace_waters(self, output_directory: str) -> Tuple[pd.DataFrame, int]: return stable_waters, total_frames def perform_clustering_and_writing( - self, stable_waters: pd.DataFrame, cluster_eps: float, total_frames: int, output_directory: str + self, + stable_waters: pd.DataFrame, + cluster_eps: float, + total_frames: int, + output_directory: str, ) -> None: """ Perform DBSCAN clustering on the stable water coordinates, and write the clusters and their representatives to PDB files. @@ -121,7 +124,10 @@ def perform_clustering_and_writing( ) def write_pdb_clusters_and_representatives( - self, clustered_waters: pd.DataFrame, min_samples: int, output_sub_directory: str + self, + clustered_waters: pd.DataFrame, + min_samples: int, + output_sub_directory: str, ) -> None: """ Writes the clusters and their representatives to PDB files. @@ -136,9 +142,7 @@ def write_pdb_clusters_and_representatives( print("minsamples:") print(min_samples) os.makedirs(output_sub_directory, exist_ok=True) - with pd.option_context( - "display.max_rows", None - ): + with pd.option_context("display.max_rows", None): for label, cluster in clustered_waters.groupby("Cluster_Label"): pdb_lines = [] for _, row in cluster.iterrows(): @@ -210,7 +214,10 @@ def filter_and_parse_pdb(self, protein_pdb: str) -> Structure: return structure def find_interacting_residues( - self, structure: Structure, representative_waters: pd.DataFrame, distance_threshold: float + self, + structure: Structure, + representative_waters: pd.DataFrame, + distance_threshold: float, ) -> Dict[int, List[Tuple[str, int]]]: """Maps waters (e.g. the representative waters) to interacting residues of a different PDB structure input. Args: diff --git a/openmmdl/openmmdl_analysis/interaction_gathering.py b/openmmdl/openmmdl_analysis/interaction_gathering.py index c36ba09b..08d0d8f3 100644 --- a/openmmdl/openmmdl_analysis/interaction_gathering.py +++ b/openmmdl/openmmdl_analysis/interaction_gathering.py @@ -14,13 +14,13 @@ class InteractionAnalyzer: def __init__( - self, - pdb_md: mda.Universe, - dataframe: Optional[str], - num_processes: int, - lig_name: str, - special_ligand: Optional[str], - peptide: Optional[str] + self, + pdb_md: mda.Universe, + dataframe: Optional[str], + num_processes: int, + lig_name: str, + special_ligand: Optional[str], + peptide: Optional[str], ) -> None: self.pdb_md = pdb_md self.dataframe = dataframe @@ -130,7 +130,9 @@ def retrieve_plip_interactions_peptide( return sites def create_df_from_binding_site( - self, selected_site_interactions: Dict[str, List], interaction_type: str = "hbond" + self, + selected_site_interactions: Dict[str, List], + interaction_type: str = "hbond", ) -> pd.DataFrame: """Create a DataFrame from a binding site and interaction type. @@ -164,9 +166,7 @@ def create_df_from_binding_site( ) return df - def change_lig_to_residue( - self, file_path: str, new_residue_name: str - ) -> None: + def change_lig_to_residue(self, file_path: str, new_residue_name: str) -> None: """Reformat the topology file to change the ligand to a residue. Args: @@ -202,7 +202,7 @@ def process_frame_special(self, frame: int) -> pd.DataFrame: atoms_selected = self.pdb_md.select_atoms( f"protein or nucleic or resname {self.lig_name} or (resname HOH and around 10 resname {self.lig_name}) or resname {self.special}" ) - for num in self.pdb_md.trajectory[(frame):(frame + 1)]: + for num in self.pdb_md.trajectory[(frame) : (frame + 1)]: atoms_selected.write(f"processing_frame_{frame}.pdb") interaction_list = pd.DataFrame() interactions_by_site = self.retrieve_plip_interactions( @@ -230,7 +230,7 @@ def process_frame_special(self, frame: int) -> pd.DataFrame: tmp_interaction["frame"] = frame tmp_interaction["interaction_type"] = interaction_type interaction_list = pd.concat([interaction_list, tmp_interaction]) - + os.remove(f"processing_frame_{frame}.pdb") return interaction_list @@ -246,7 +246,7 @@ def process_frame(self, frame: int) -> pd.DataFrame: atoms_selected = self.pdb_md.select_atoms( f"protein or nucleic or resname {self.lig_name} or (resname HOH and around 10 resname {self.lig_name}) or resname {self.special}" ) - for num in self.pdb_md.trajectory[(frame):(frame + 1)]: + for num in self.pdb_md.trajectory[(frame) : (frame + 1)]: atoms_selected.write(f"processing_frame_{frame}.pdb") interaction_list = pd.DataFrame() if self.peptide is None: @@ -298,7 +298,7 @@ def process_frame(self, frame: int) -> pd.DataFrame: tmp_interaction["frame"] = frame tmp_interaction["interaction_type"] = interaction_type interaction_list = pd.concat([interaction_list, tmp_interaction]) - + os.remove(f"processing_frame_{frame}.pdb") return interaction_list @@ -331,11 +331,13 @@ def fill_missing_frames(self, df: pd.DataFrame) -> pd.DataFrame: for frame in missing_frames: for interaction_type in df["interaction_type"].unique(): - missing_data.append({ - "frame": frame, - "interaction_type": interaction_type, - # Add other necessary columns with NaNs or empty values as appropriate - }) + missing_data.append( + { + "frame": frame, + "interaction_type": interaction_type, + # Add other necessary columns with NaNs or empty values as appropriate + } + ) missing_df = pd.DataFrame(missing_data) filled_df = pd.concat([df, missing_df], ignore_index=True) @@ -352,7 +354,10 @@ def process_trajectory(self) -> pd.DataFrame: interaction_list = pd.concat( list( tqdm( - pool.imap(self.process_frame_wrapper, range(len(self.pdb_md.trajectory))), + pool.imap( + self.process_frame_wrapper, + range(len(self.pdb_md.trajectory)), + ), total=len(self.pdb_md.trajectory), ) ) @@ -362,8 +367,8 @@ def process_trajectory(self) -> pd.DataFrame: if self.dataframe: interaction_list.to_csv(self.dataframe) - + # Ensure all frames are represented in the final DataFrame interaction_list = self.fill_missing_frames(interaction_list) - + return interaction_list diff --git a/openmmdl/openmmdl_analysis/markov_state_figure_generation.py b/openmmdl/openmmdl_analysis/markov_state_figure_generation.py index 7e40c020..220acff8 100644 --- a/openmmdl/openmmdl_analysis/markov_state_figure_generation.py +++ b/openmmdl/openmmdl_analysis/markov_state_figure_generation.py @@ -4,6 +4,7 @@ import os from typing import Dict, List, Tuple, Union + class MarkovChainAnalysis: def __init__(self, min_transition: float): self.min_transition = min_transition @@ -29,7 +30,7 @@ def generate_transition_graph( combined_dict: Dict[str, List[str]], fig_type: str = "png", font_size: int = 36, - size_node: int = 200 + size_node: int = 200, ) -> None: """Generate Markov Chain plots based on transition probabilities. @@ -49,8 +50,10 @@ def generate_transition_graph( part1_length: int = part_length + remaining_length part2_length: int = part_length part1_data: List[str] = combined_dict["all"][:part1_length] - part2_data: List[str] = combined_dict["all"][part1_length: part1_length + part2_length] - part3_data: List[str] = combined_dict["all"][part1_length + part2_length:] + part2_data: List[str] = combined_dict["all"][ + part1_length : part1_length + part2_length + ] + part3_data: List[str] = combined_dict["all"][part1_length + part2_length :] # Count the occurrences of each node in each part part1_node_occurrences: Dict[str, int] = { @@ -83,10 +86,14 @@ def generate_transition_graph( node_occurrences: Dict[str, int] = { node: combined_dict["all"].count(node) for node in set(combined_dict["all"]) } - top_10_nodes: List[str] = sorted(node_occurrences, key=node_occurrences.get, reverse=True)[:10] + top_10_nodes: List[str] = sorted( + node_occurrences, key=node_occurrences.get, reverse=True + )[:10] for min_transition_percent in self.min_transitions: - min_prob: float = min_transition_percent / 100 # Convert percentage to probability + min_prob: float = ( + min_transition_percent / 100 + ) # Convert percentage to probability # Create a directed graph G: nx.DiGraph = nx.DiGraph() @@ -108,19 +115,32 @@ def generate_transition_graph( # Add edges to the graph with their probabilities for transition, count in transitions.items(): current_state, next_state = transition - probability: float = (count / len(combined_dict["all"])) * 100 # Convert probability to percentage + probability: float = ( + count / len(combined_dict["all"]) + ) * 100 # Convert probability to percentage if probability >= min_transition_percent: G.add_edge(current_state, next_state, weight=probability) # Include the reverse transition with a different color reverse_transition: Tuple[str, str] = (next_state, current_state) - reverse_count: int = transitions.get(reverse_transition, 0) # Use the correct count for the reverse transition - reverse_probability: float = (reverse_count / len(combined_dict["all"])) * 100 - G.add_edge(next_state, current_state, weight=reverse_probability, reverse=True) + reverse_count: int = transitions.get( + reverse_transition, 0 + ) # Use the correct count for the reverse transition + reverse_probability: float = ( + reverse_count / len(combined_dict["all"]) + ) * 100 + G.add_edge( + next_state, + current_state, + weight=reverse_probability, + reverse=True, + ) # Add self-loops to the graph with their probabilities for self_loop, count in self_loops.items(): state: str = self_loop[0] - probability: float = (count / len(combined_dict["all"])) * 100 # Convert probability to percentage + probability: float = ( + count / len(combined_dict["all"]) + ) * 100 # Convert probability to percentage if probability >= min_transition_percent: G.add_edge(state, state, weight=probability) @@ -139,11 +159,19 @@ def generate_transition_graph( forward_count: int = transitions.get(forward_transition, 0) backward_count: int = transitions.get(backward_transition, 0) - transition_probabilities_forward[forward_transition] = (forward_count / node_occurrences[start_state]) * 100 - transition_occurrences_forward[forward_transition] = (forward_count / len(combined_dict["all"])) * 100 + transition_probabilities_forward[forward_transition] = ( + forward_count / node_occurrences[start_state] + ) * 100 + transition_occurrences_forward[forward_transition] = ( + forward_count / len(combined_dict["all"]) + ) * 100 - transition_probabilities_backward[backward_transition] = (backward_count / node_occurrences[end_state]) * 100 - transition_occurrences_backward[backward_transition] = (backward_count / len(combined_dict["all"])) * 100 + transition_probabilities_backward[backward_transition] = ( + backward_count / node_occurrences[end_state] + ) * 100 + transition_occurrences_backward[backward_transition] = ( + backward_count / len(combined_dict["all"]) + ) * 100 # Calculate self-loop probabilities self_loop_probabilities: Dict[str, float] = {} @@ -155,10 +183,15 @@ def generate_transition_graph( # Generate the Markov Chain plot plt.figure(figsize=(60, 60)) # Increased figure size - plt.title(f"Markov Chain Plot {min_transition_percent}% Frames Transition", fontsize=72) + plt.title( + f"Markov Chain Plot {min_transition_percent}% Frames Transition", + fontsize=72, + ) # Draw nodes and edges - pos: Dict[str, Tuple[float, float]] = nx.spring_layout(G, k=2, seed=42) # Increased distance between nodes (k=2) + pos: Dict[str, Tuple[float, float]] = nx.spring_layout( + G, k=2, seed=42 + ) # Increased distance between nodes (k=2) edge_colors: List[str] = [] for u, v, data in G.edges(data=True): @@ -167,7 +200,9 @@ def generate_transition_graph( if u == v: # Check if it is a self-loop edge_colors.append("green") # Set green color for self-loop arrows width: float = 0.1 # Make self-loop arrows smaller - connection_style: str = "arc3,rad=-0.1" # Make the self-loops more curved and compact + connection_style: str = ( + "arc3,rad=-0.1" # Make the self-loops more curved and compact + ) nx.draw_networkx_edges( G, pos, @@ -178,13 +213,19 @@ def generate_transition_graph( connectionstyle=connection_style, ) elif weight >= min_transition_percent: - edge_colors.append("black") # Highlight significant transitions in black + edge_colors.append( + "black" + ) # Highlight significant transitions in black # Check if both nodes are present before adding labels if G.has_node(u) and G.has_node(v): width: float = 4.0 - forward_label: str = f"{transition_occurrences_forward.get((v, u), 0):.2f}% of Frames →, {transition_probabilities_forward.get((v, u), 0):.2f}% probability" - backward_label: str = f"{transition_occurrences_backward.get((u, v), 0):.2f}% of Frames ←, {transition_probabilities_backward.get((u, v), 0):.2f}% probability" + forward_label: str = ( + f"{transition_occurrences_forward.get((v, u), 0):.2f}% of Frames →, {transition_probabilities_forward.get((v, u), 0):.2f}% probability" + ) + backward_label: str = ( + f"{transition_occurrences_backward.get((u, v), 0):.2f}% of Frames ←, {transition_probabilities_backward.get((u, v), 0):.2f}% probability" + ) edge_label: str = f"{forward_label}\n{backward_label}" connection_style: str = "arc3,rad=-0.1" @@ -197,15 +238,23 @@ def generate_transition_graph( edge_color=edge_colors[-1], connectionstyle=connection_style, ) - nx.draw_networkx_edge_labels(G, pos, edge_labels={(u, v): edge_label}, font_size=26) + nx.draw_networkx_edge_labels( + G, pos, edge_labels={(u, v): edge_label}, font_size=26 + ) else: - edge_colors.append("grey") # Use grey for non-significant transitions + edge_colors.append( + "grey" + ) # Use grey for non-significant transitions width: float = 0.5 # Check if both nodes are present before adding labels if G.has_node(u) and G.has_node(v): - forward_label: str = f"{transition_occurrences_forward.get((v, u), 0):.2f}% of Frames →, {transition_probabilities_forward.get((v, u), 0):.2f}% probability" - backward_label: str = f"{transition_occurrences_backward.get((u, v), 0):.2f}% of Frames ←, {transition_probabilities_backward.get((u, v), 0):.2f}% probability" + forward_label: str = ( + f"{transition_occurrences_forward.get((v, u), 0):.2f}% of Frames →, {transition_probabilities_forward.get((v, u), 0):.2f}% probability" + ) + backward_label: str = ( + f"{transition_occurrences_backward.get((u, v), 0):.2f}% of Frames ←, {transition_probabilities_backward.get((u, v), 0):.2f}% probability" + ) edge_label: str = f"{forward_label}\n{backward_label}" connection_style: str = "arc3,rad=-0.1" @@ -218,15 +267,23 @@ def generate_transition_graph( edge_color=edge_colors[-1], connectionstyle=connection_style, ) - nx.draw_networkx_edge_labels(G, pos, edge_labels={(u, v): edge_label}, font_size=36) + nx.draw_networkx_edge_labels( + G, pos, edge_labels={(u, v): edge_label}, font_size=36 + ) # Update the node colors based on their appearance percentages in each part node_colors: List[str] = [] for node in G.nodes(): if node in top_10_nodes: - part1_percentage: float = (part1_node_occurrences.get(node, 0) / node_occurrences[node]) - part2_percentage: float = (part2_node_occurrences.get(node, 0) / node_occurrences[node]) - part3_percentage: float = (part3_node_occurrences.get(node, 0) / node_occurrences[node]) + part1_percentage: float = ( + part1_node_occurrences.get(node, 0) / node_occurrences[node] + ) + part2_percentage: float = ( + part2_node_occurrences.get(node, 0) / node_occurrences[node] + ) + part3_percentage: float = ( + part3_node_occurrences.get(node, 0) / node_occurrences[node] + ) if part1_percentage > 0.5: node_colors.append("green") @@ -240,8 +297,12 @@ def generate_transition_graph( node_colors.append("skyblue") # Draw nodes with sizes correlated to occurrences and color top 10 nodes - node_size: List[int] = [size_node * node_occurrences[node] for node in G.nodes()] - nx.draw_networkx_nodes(G, pos, node_size=node_size, node_color=node_colors, alpha=0.8) + node_size: List[int] = [ + size_node * node_occurrences[node] for node in G.nodes() + ] + nx.draw_networkx_nodes( + G, pos, node_size=node_size, node_color=node_colors, alpha=0.8 + ) # Draw node labels with occurrence percentage and self-loop probability for nodes with edges node_labels: Dict[str, str] = {} @@ -261,10 +322,18 @@ def generate_transition_graph( if relevant_edges: if node in top_10_nodes: - node_occurrence_percentage: float = (node_occurrences[node] / len(combined_dict["all"])) * 100 - self_loop_probability: float = self_loop_probabilities.get(node, 0) * 100 - self_loop_occurence: float = self_loop_occurences.get(node, 0) - node_label: str = f"{node}\nOccurrences: {node_occurrence_percentage:.2f}%\nSelf-Loop Probability: {self_loop_probability:.2f}% \nSelf-Loop Occurrence: {self_loop_occurence:.2f}%" + node_occurrence_percentage: float = ( + node_occurrences[node] / len(combined_dict["all"]) + ) * 100 + self_loop_probability: float = ( + self_loop_probabilities.get(node, 0) * 100 + ) + self_loop_occurence: float = self_loop_occurences.get( + node, 0 + ) + node_label: str = ( + f"{node}\nOccurrences: {node_occurrence_percentage:.2f}%\nSelf-Loop Probability: {self_loop_probability:.2f}% \nSelf-Loop Occurrence: {self_loop_occurence:.2f}%" + ) node_labels[node] = node_label else: node_labels[node] = node @@ -285,9 +354,13 @@ def generate_transition_graph( plt.tight_layout() # Save the plot - plot_filename: str = f"markov_chain_plot_{min_transition_percent}.{fig_type}" + plot_filename: str = ( + f"markov_chain_plot_{min_transition_percent}.{fig_type}" + ) plot_path: str = os.path.join("Binding_Modes_Markov_States", plot_filename) - os.makedirs("Binding_Modes_Markov_States", exist_ok=True) # Create the folder if it doesn't exist + os.makedirs( + "Binding_Modes_Markov_States", exist_ok=True + ) # Create the folder if it doesn't exist plt.savefig(plot_path, dpi=300) plt.clf() # Clear the current figure for the next iteration diff --git a/openmmdl/openmmdl_analysis/openmmdlanalysis.py b/openmmdl/openmmdl_analysis/openmmdlanalysis.py index 0b5c3940..70286a4b 100644 --- a/openmmdl/openmmdl_analysis/openmmdlanalysis.py +++ b/openmmdl/openmmdl_analysis/openmmdlanalysis.py @@ -373,7 +373,7 @@ def main(): interaction_analysis = InteractionAnalyzer( pdb_md, dataframe, cpu_count, ligand, special_ligand, peptide ) - interaction_list = interaction_analysis.ineraction_list + interaction_list = interaction_analysis.interaction_list interaction_list.to_csv("missing_frames_filled.csv") interaction_list = interaction_list.reset_index(drop=True) diff --git a/openmmdl/openmmdl_analysis/pharmacophore.py b/openmmdl/openmmdl_analysis/pharmacophore.py index fb09eacf..76d00263 100644 --- a/openmmdl/openmmdl_analysis/pharmacophore.py +++ b/openmmdl/openmmdl_analysis/pharmacophore.py @@ -4,6 +4,7 @@ import numpy as np from typing import Dict, List, Optional, Union + class PharmacophoreGenerator: def __init__(self, df_all: pd.DataFrame, ligand_name: str): self.df_all = df_all @@ -12,7 +13,9 @@ def __init__(self, df_all: pd.DataFrame, ligand_name: str): self.coord_pattern = re.compile(r"\(([\d.-]+), ([\d.-]+), ([\d.-]+)\)") self.clouds = self._generate_clouds() - def _generate_clouds(self) -> Dict[str, Dict[str, Union[List[List[float]], List[float], float]]]: + def _generate_clouds( + self, + ) -> Dict[str, Dict[str, Union[List[List[float]], List[float], float]]]: interaction_coords = { "hydrophobic": [], "acceptor": [], @@ -57,7 +60,9 @@ def _generate_clouds(self) -> Dict[str, Dict[str, Union[List[List[float]], List[ return self._format_clouds(interaction_coords) - def _format_clouds(self, interaction_coords: Dict[str, List[List[float]]]) -> Dict[str, Dict[str, Union[List[List[float]], List[float], float]]]: + def _format_clouds( + self, interaction_coords: Dict[str, List[List[float]]] + ) -> Dict[str, Dict[str, Union[List[List[float]], List[float], float]]]: color_mapping = { "hydrophobic": [1.0, 1.0, 0.0], "acceptor": [1.0, 0.0, 0.0], @@ -79,10 +84,14 @@ def _format_clouds(self, interaction_coords: Dict[str, List[List[float]]]) -> Di for interaction, coords in interaction_coords.items() } - def to_dict(self) -> Dict[str, Dict[str, Union[List[List[float]], List[float], float]]]: + def to_dict( + self, + ) -> Dict[str, Dict[str, Union[List[List[float]], List[float], float]]]: return self.clouds - def generate_pharmacophore_centers(self, interactions: List[str]) -> Dict[str, List[float]]: + def generate_pharmacophore_centers( + self, interactions: List[str] + ) -> Dict[str, List[float]]: """Generates pharmacophore points for interactions that are points such as hydrophobic and ionic interactions. Args: @@ -113,7 +122,9 @@ def generate_pharmacophore_centers(self, interactions: List[str]) -> Dict[str, L pharmacophore[interaction] = [center_x, center_y, center_z] return pharmacophore - def generate_pharmacophore_vectors(self, interactions: List[str]) -> Dict[str, List[List[float]]]: + def generate_pharmacophore_vectors( + self, interactions: List[str] + ) -> Dict[str, List[List[float]]]: """Generates pharmacophore points for interactions that are vectors such as hydrogen bond donors or acceptors. Args: @@ -332,7 +343,7 @@ def generate_bindingmode_pharmacophore( self, dict_bindingmode: Dict[str, Dict[str, List[List[float]]]], outname: str, - id_num: int = 0 + id_num: int = 0, ) -> None: """Generates pharmacophore from a binding mode and writes it to a .pml file. @@ -366,10 +377,12 @@ def generate_bindingmode_pharmacophore( for interaction, coords in dict_bindingmode.items(): # Get feature type - feature_type = next((ft for it, ft in feature_types.items() if it in interaction), None) + feature_type = next( + (ft for it, ft in feature_types.items() if it in interaction), None + ) if feature_type is None: continue - + # Generate vector features if feature_type in ["HBA", "HBD"]: if feature_type == "HBA": @@ -479,7 +492,9 @@ def generate_bindingmode_pharmacophore( xml_declaration=True, ) - def generate_pharmacophore_centers_all_points(self, interactions: List[str]) -> Dict[str, List[List[float]]]: + def generate_pharmacophore_centers_all_points( + self, interactions: List[str] + ) -> Dict[str, List[List[float]]]: """Generates pharmacophore points for all interactions to generate point cloud. Args: diff --git a/openmmdl/openmmdl_analysis/preprocessing.py b/openmmdl/openmmdl_analysis/preprocessing.py index 0c7530b1..d24c16b3 100644 --- a/openmmdl/openmmdl_analysis/preprocessing.py +++ b/openmmdl/openmmdl_analysis/preprocessing.py @@ -9,6 +9,7 @@ from openbabel import pybel import mdtraj as md + class Preprocessing: def __init__(self): pass @@ -31,9 +32,31 @@ def renumber_protein_residues( ref_top_df, _ = traj_reference.topology.to_dataframe() protein_residues = [ - "ALA", "ARG", "ASN", "ASP", "CYS", "GLN", "GLU", "GLY", "HIS", "ILE", - "LEU", "LYS", "MET", "PHE", "PRO", "SER", "THR", "TRP", "TYR", "VAL", - "ACE", "NME", "HIE", "HID", "HIP" + "ALA", + "ARG", + "ASN", + "ASP", + "CYS", + "GLN", + "GLU", + "GLY", + "HIS", + "ILE", + "LEU", + "LYS", + "MET", + "PHE", + "PRO", + "SER", + "THR", + "TRP", + "TYR", + "VAL", + "ACE", + "NME", + "HIE", + "HID", + "HIP", ] for chain_id in ref_top_df["chainID"].unique(): @@ -41,10 +64,13 @@ def renumber_protein_residues( ref_top_df[ (ref_top_df["chainID"] == chain_id) & ref_top_df["resName"].isin(protein_residues) - ]["resSeq"].values - 1 + ]["resSeq"].values + - 1 ) - mask = (input_top_df["chainID"] == chain_id) & input_top_df["resName"].isin(protein_residues) + mask = (input_top_df["chainID"] == chain_id) & input_top_df["resName"].isin( + protein_residues + ) input_top_df.loc[mask, "resSeq"] = ref_residue_indices + 1 new_top = md.Topology.from_dataframe(input_top_df) @@ -113,7 +139,9 @@ def extract_and_save_ligand_as_sdf( ligand_atoms = u.select_atoms(f"resname {target_resname}") if len(ligand_atoms) == 0: - print(f"No ligand with residue name '{target_resname}' found in the PDB file.") + print( + f"No ligand with residue name '{target_resname}' found in the PDB file." + ) return ligand_universe = mda.Merge(ligand_atoms) @@ -149,7 +177,9 @@ def renumber_atoms_in_residues( if residue_name == lig_name: element_match = re.match(r"([A-Z]+)", atom_name) element = element_match.group(1) if element_match else atom_name - lig_residue_elements[element] = lig_residue_elements.get(element, 0) + 1 + lig_residue_elements[element] = ( + lig_residue_elements.get(element, 0) + 1 + ) new_atom_name = f"{element}{lig_residue_elements[element]}" line = line[:12] + f"{new_atom_name:4}" + line[16:] diff --git a/openmmdl/openmmdl_analysis/rdkit_figure_generation.py b/openmmdl/openmmdl_analysis/rdkit_figure_generation.py index d630f30a..694b0337 100644 --- a/openmmdl/openmmdl_analysis/rdkit_figure_generation.py +++ b/openmmdl/openmmdl_analysis/rdkit_figure_generation.py @@ -7,7 +7,8 @@ import pylab import os import MDAnalysis as mda - +from typing import List, Dict, Tuple +from typing import List, Optional class LigandImageGenerator: def __init__( @@ -104,8 +105,8 @@ def generate_image(self) -> None: print(f"Error: {e}") -import MDAnalysis as mda -from typing import List, Dict, Tuple + + class InteractionProcessor: def __init__(self, complex_pdb_file: str, ligand_no_h_pdb_file: str): @@ -141,8 +142,20 @@ def split_interaction_data(self, data: List[str]) -> List[str]: split_data.append(split_value) return split_data - def highlight_numbers(self, split_data: List[str], starting_idx: List[int]) -> Tuple[ - List[int], List[int], List[int], List[int], List[int], List[int], List[int], List[int], List[int], List[int], List[int] + def highlight_numbers( + self, split_data: List[str], starting_idx: List[int] + ) -> Tuple[ + List[int], + List[int], + List[int], + List[int], + List[int], + List[int], + List[int], + List[int], + List[int], + List[int], + List[int], ]: """Extracts the data from the split_data output of the interactions and categorizes it to its respective list. @@ -209,7 +222,9 @@ def highlight_numbers(self, split_data: List[str], starting_idx: List[int]) -> T if atom_name is None: continue lig_atom = self.lig_noh.select_atoms(f"name {atom_name}") - lig_real_index = lig_atom[0].id - 1 if len(lig_atom) > 0 else None + lig_real_index = ( + lig_atom[0].id - 1 if len(lig_atom) > 0 else None + ) if lig_real_index is not None: highlighted_pistacking.append(lig_real_index) @@ -223,11 +238,15 @@ def highlight_numbers(self, split_data: List[str], starting_idx: List[int]) -> T for code in split_codes: atom_index = int(code) complex_id = self.complex.select_atoms(f"id {atom_index}") - atom_name = complex_id[0].name if len(complex_id) > 0 else None + atom_name = ( + complex_id[0].name if len(complex_id) > 0 else None + ) if atom_name is None: continue lig_atom = self.lig_noh.select_atoms(f"name {atom_name}") - lig_real_index = lig_atom[0].id - 1 if len(lig_atom) > 0 else None + lig_real_index = ( + lig_atom[0].id - 1 if len(lig_atom) > 0 else None + ) if lig_real_index is not None: highlighted_ni.append(lig_real_index) @@ -235,11 +254,15 @@ def highlight_numbers(self, split_data: List[str], starting_idx: List[int]) -> T for code in numeric_codes: atom_index = int(code) complex_id = self.complex.select_atoms(f"id {atom_index}") - atom_name = complex_id[0].name if len(complex_id) > 0 else None + atom_name = ( + complex_id[0].name if len(complex_id) > 0 else None + ) if atom_name is None: continue lig_atom = self.lig_noh.select_atoms(f"name {atom_name}") - lig_real_index = lig_atom[0].id - 1 if len(lig_atom) > 0 else None + lig_real_index = ( + lig_atom[0].id - 1 if len(lig_atom) > 0 else None + ) if lig_real_index is not None: highlighted_pi.append(lig_real_index) @@ -270,7 +293,9 @@ def highlight_numbers(self, split_data: List[str], starting_idx: List[int]) -> T highlighted_metal, ) - def generate_interaction_dict(self, interaction_type: str, keys: List[int]) -> Dict[int, Tuple[float, float, float]]: + def generate_interaction_dict( + self, interaction_type: str, keys: List[int] + ) -> Dict[int, Tuple[float, float, float]]: """Generates a dictionary of interaction RGB color model based on the provided interaction type. Args: @@ -297,11 +322,13 @@ def generate_interaction_dict(self, interaction_type: str, keys: List[int]) -> D if interaction_type not in interaction_dict: raise ValueError(f"Unknown interaction type: {interaction_type}") - return { - int(key): interaction_dict[interaction_type] for key in keys - } + return {int(key): interaction_dict[interaction_type] for key in keys} - def update_dict(self, target_dict: Dict[int, Tuple[float, float, float]], *source_dicts: Dict[int, Tuple[float, float, float]]): + def update_dict( + self, + target_dict: Dict[int, Tuple[float, float, float]], + *source_dicts: Dict[int, Tuple[float, float, float]], + ): """Updates the dictionary with the keys and values from other dictionaries. Args: @@ -321,18 +348,13 @@ def update_dict(self, target_dict: Dict[int, Tuple[float, float, float]], *sourc target_dict[key] = value -from typing import List, Optional -from PIL import Image -import pylab -import os - class ImageMerger: def __init__( - self, - binding_mode: str, - occurrence_percent: float, - split_data: List[str], - merged_image_paths: List[str] + self, + binding_mode: str, + occurrence_percent: float, + split_data: List[str], + merged_image_paths: List[str], ) -> None: self.binding_mode = binding_mode self.occurrence_percent = occurrence_percent @@ -361,7 +383,9 @@ def create_and_merge_images(self) -> List[str]: for i, data in enumerate(filtered_split_data): y = data_points[i] label = data.split()[-1] - type_ = data.split()[-2] # Renamed 'type' to 'type_' to avoid conflict with the built-in type() function + type_ = data.split()[ + -2 + ] # Renamed 'type' to 'type_' to avoid conflict with the built-in type() function if label == "hydrophobic": (line,) = ax.plot( x, y, label=data, color=(1.0, 1.0, 0.0), linewidth=5.0 diff --git a/openmmdl/openmmdl_analysis/rmsd_calculation.py b/openmmdl/openmmdl_analysis/rmsd_calculation.py index cbcc4505..4715fbc1 100644 --- a/openmmdl/openmmdl_analysis/rmsd_calculation.py +++ b/openmmdl/openmmdl_analysis/rmsd_calculation.py @@ -27,7 +27,9 @@ class RMSDAnalyzer: def __init__(self, prot_lig_top_file: str, prot_lig_traj_file: str) -> None: self.prot_lig_top_file: str = prot_lig_top_file self.prot_lig_traj_file: str = prot_lig_traj_file - self.universe: mda.Universe = mda.Universe(prot_lig_top_file, prot_lig_traj_file) + self.universe: mda.Universe = mda.Universe( + prot_lig_top_file, prot_lig_traj_file + ) def rmsd_for_atomgroups( self, fig_type: str, selection1: str, selection2: Optional[List[str]] = None diff --git a/openmmdl/openmmdl_analysis/visualization.ipynb b/openmmdl/openmmdl_analysis/visualization.ipynb index 8040c014..ab4e7b09 100644 --- a/openmmdl/openmmdl_analysis/visualization.ipynb +++ b/openmmdl/openmmdl_analysis/visualization.ipynb @@ -16,10 +16,10 @@ "metadata": {}, "outputs": [], "source": [ - "ligname = 'UNK' # add the name of the ligand from the original PDB File\n", - "special = None # if special ligand like HEM is present add the name of the special ligand from the original PDB File\n", - "md = mda.Universe('PATH_TO_interacting_waters.pdb', 'PATH_TO_interacting_waters.dcd')\n", - "cloud = 'PATH_TO_clouds.json'\n", + "ligname = \"UNK\" # add the name of the ligand from the original PDB File\n", + "special = None # if special ligand like HEM is present add the name of the special ligand from the original PDB File\n", + "md = mda.Universe(\"PATH_TO_interacting_waters.pdb\", \"PATH_TO_interacting_waters.dcd\")\n", + "cloud = \"PATH_TO_clouds.json\"\n", "\n", "visualizer = Visualizer(md, cloud, ligname, special)" ] @@ -61,7 +61,7 @@ "source": [ "# Predefined variables: receptor_type=\"protein or nucleic\", height=\"1000px\", width=\"1000px\"\n", "view = visualizer.visualize()\n", - "view.display(gui=True, style='ngl')" + "view.display(gui=True, style=\"ngl\")" ] } ], diff --git a/openmmdl/openmmdl_analysis/visualization_functions.py b/openmmdl/openmmdl_analysis/visualization_functions.py index 4b4f8441..37079ef2 100644 --- a/openmmdl/openmmdl_analysis/visualization_functions.py +++ b/openmmdl/openmmdl_analysis/visualization_functions.py @@ -11,7 +11,9 @@ class TrajectorySaver: - def __init__(self, pdb_md: mda.Universe, ligname: str, special: str, nucleic: bool) -> None: + def __init__( + self, pdb_md: mda.Universe, ligname: str, special: str, nucleic: bool + ) -> None: """Initializes the TrajectorySaver with an mda.Universe object, ligand name, special residue name, and receptor type. Args: @@ -25,7 +27,9 @@ def __init__(self, pdb_md: mda.Universe, ligname: str, special: str, nucleic: bo self.special = special self.nucleic = nucleic - def save_interacting_waters_trajectory(self, interacting_waters: List[int], outputpath: str = './Visualization/') -> None: + def save_interacting_waters_trajectory( + self, interacting_waters: List[int], outputpath: str = "./Visualization/" + ) -> None: """Saves .pdb and .dcd files of the trajectory containing ligand, receptor, and all interacting waters. Args: @@ -48,7 +52,9 @@ def save_interacting_waters_trajectory(self, interacting_waters: List[int], outp for ts in self.pdb_md.trajectory: W.write(water_atoms) - def save_frame(self, frame: int, outpath: str, selection: Optional[str] = None) -> None: + def save_frame( + self, frame: int, outpath: str, selection: Optional[str] = None + ) -> None: """Saves a single frame of the trajectory. Args: @@ -65,13 +71,21 @@ def save_frame(self, frame: int, outpath: str, selection: Optional[str] = None) class Visualizer: - def __init__(self, md: mda.Universe, cloud_path: str, ligname: str, special: Optional[str] = None) -> None: + def __init__( + self, + md: mda.Universe, + cloud_path: str, + ligname: str, + special: Optional[str] = None, + ) -> None: self.md = md self.cloud = self.load_cloud(cloud_path) self.ligname = ligname self.special = special - def load_cloud(self, cloud_path: str) -> Dict[str, Dict[str, Union[List[float], List[int]]]]: + def load_cloud( + self, cloud_path: str + ) -> Dict[str, Dict[str, Union[List[float], List[int]]]]: """Loads interaction cloud data from a JSON file. Args: @@ -85,7 +99,10 @@ def load_cloud(self, cloud_path: str) -> Dict[str, Dict[str, Union[List[float], return data def visualize( - self, receptor_type: str = "protein or nucleic", height: str = "1000px", width: str = "1000px" + self, + receptor_type: str = "protein or nucleic", + height: str = "1000px", + width: str = "1000px", ) -> nv.NGLWidget: """Generates visualization of the trajectory with the interacting waters and interaction clouds. From babc360b47ca459ceb81b034916f5ce93d52f031 Mon Sep 17 00:00:00 2001 From: Valerij Talagayev <82884038+talagayev@users.noreply.github.com> Date: Wed, 18 Sep 2024 11:40:12 +0200 Subject: [PATCH 13/23] Update barcode_generation.py adjusted changes to fix some errors --- openmmdl/openmmdl_analysis/barcode_generation.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/openmmdl/openmmdl_analysis/barcode_generation.py b/openmmdl/openmmdl_analysis/barcode_generation.py index ee804f9b..8bf6c78b 100644 --- a/openmmdl/openmmdl_analysis/barcode_generation.py +++ b/openmmdl/openmmdl_analysis/barcode_generation.py @@ -82,7 +82,7 @@ def generate_waterids_barcode(self, interaction: str) -> List[Union[int, int]]: water_id_list = [] waterid_barcode = [] - for _, row in self.df.iterrows(): + for index, row in self.df.iterrows(): if row[interaction] == 1: water_id_list.append(int(float(row["WATER_IDX"]))) @@ -296,11 +296,11 @@ def plot_barcodes_grouped( ligatom.pop(-1) ligatom = "_".join(ligatom) ligatoms_dict.setdefault(ligatom, []).append(interaction) - - total_interactions: Dict[str, np.ndarray] = {} - for ligatom, interactions_list in ligatoms_dict.items(): - ligatom_interaction_barcodes: Dict[str, np.ndarray] = {} - for interaction in interactions_list: + + total_interactions = {} + for ligatom in ligatoms_dict: + ligatom_interaction_barcodes = {} + for interaction in ligatoms_dict[ligatom]: barcode = self.barcode_gen.generate_barcode(interaction) ligatom_interaction_barcodes[interaction] = barcode os.makedirs(f"./Barcodes/{ligatom}", exist_ok=True) From ed3adfb2ddfe881cf23cf929e5c57273bad6ce15 Mon Sep 17 00:00:00 2001 From: Valerij Talagayev <82884038+talagayev@users.noreply.github.com> Date: Wed, 18 Sep 2024 12:05:44 +0200 Subject: [PATCH 14/23] Update binding_mode_processing.py returned to original --- .../binding_mode_processing.py | 399 +++++++++++------- 1 file changed, 252 insertions(+), 147 deletions(-) diff --git a/openmmdl/openmmdl_analysis/binding_mode_processing.py b/openmmdl/openmmdl_analysis/binding_mode_processing.py index 135694d0..98a90c1a 100644 --- a/openmmdl/openmmdl_analysis/binding_mode_processing.py +++ b/openmmdl/openmmdl_analysis/binding_mode_processing.py @@ -7,47 +7,41 @@ from tqdm import tqdm from pathlib import Path from numba import jit -from typing import Dict, List, Union, Optional, Tuple class BindingModeProcesser: + def __init__( - self, - pdb_md, - ligand: str, - peptide: Optional[str], - special: str, - ligand_rings: List[List[int]], - interaction_list: pd.DataFrame, - threshold: float, - ) -> None: + self, pdb_md, ligand, peptide, special, ligand_rings, interaction_list, treshold + ): self.pdb_md = pdb_md self.ligand = ligand self.peptide = peptide self.special = special self.ligand_rings = ligand_rings self.unique_columns_rings_grouped = self.gather_interactions(interaction_list) - self.interaction_list, self.unique_data = self.process_interaction_wrapper( - interaction_list, threshold / 100 + self.interaction_list, self.unique_data = self.process_interaction_wraper( + interaction_list, (treshold / 100) ) - self.interactions_all, self.unique_data_all = self.process_interaction_wrapper( + self.interactions_all, self.unique_data_all = self.process_interaction_wraper( interaction_list.copy(), 0.00001 ) - def process_interaction_wrapper( - self, interaction_list: pd.DataFrame, threshold: float - ) -> Tuple[pd.DataFrame, Dict[str, str]]: - filtered_values = self.filtering_values(threshold, interaction_list) + def process_interaction_wraper(self, interaction_list, treshold): + + filtered_values = self.filtering_values(treshold, interaction_list) interaction_list.fillna(0, inplace=True) unique_data = self.unique_data_generation(filtered_values) self.df_iteration_numbering(interaction_list, unique_data) return interaction_list, unique_data - def gather_interactions(self, df: pd.DataFrame) -> Dict[int, Dict[int, str]]: + def gather_interactions(self, df): """Process a DataFrame with the protein-ligand interaction and generate column names for each unique interaction. Args: - df (pandas.DataFrame): DataFrame that contains the interaction data for the whole trajectory. + df (pandas dataframe): DataFrame that contains the interaction data for the whole trajectory. + ligand_rings (list): A list of the ligand ring information to recognize the atom numbers belonging to rings for hydrophobic interactions. + peptide (str, optional): chainid of the peptide in the topology. Defaults to None. Returns: dict: A dictionary with the keys being 'FRAME' numbers and values being dictionaries containing row indices and their corresponding unique column names for interactions. @@ -55,17 +49,23 @@ def gather_interactions(self, df: pd.DataFrame) -> Dict[int, Dict[int, str]]: unique_columns_rings = {} unique_columns_rings_grouped = {} + # Iterate through the rows of the DataFrame if self.peptide is None: for index, row in df.iterrows(): - interaction = row["INTERACTION"] - prot_partner = row["Prot_partner"] - - if interaction == "hydrophobic": + # Check if the 'INTERACTION' is 'hydrophobic' + if row["INTERACTION"] == "hydrophobic": + # Get the values from the current row + prot_partner = row["Prot_partner"] ligcarbonidx = row["LIGCARBONIDX"] + interaction = row["INTERACTION"] ring_found = False + # Concatenate the values to form a unique column name for ligand_ring in self.ligand_rings: if ligcarbonidx in ligand_ring: - numbers_as_strings = [str(idx) for idx in ligand_ring] + numbers_as_strings = [ + str(ligcarbonidx) for ligcarbonidx in ligand_ring + ] + # Create the name with numbers separated by underscores name_with_numbers = "_".join(numbers_as_strings) col_name = ( f"{prot_partner}_{name_with_numbers}_{interaction}" @@ -73,121 +73,189 @@ def gather_interactions(self, df: pd.DataFrame) -> Dict[int, Dict[int, str]]: ring_found = True break if not ring_found: + ligcarbonidx = int(row["LIGCARBONIDX"]) col_name = f"{prot_partner}_{ligcarbonidx}_{interaction}" - elif interaction == "hbond": - if row["PROTISDON"]: + elif row["INTERACTION"] == "hbond": + if row["PROTISDON"] == True: + prot_partner = row["Prot_partner"] ligcarbonidx = int(row["ACCEPTORIDX"]) - type_ = "Acceptor" - else: + interaction = row["INTERACTION"] + type = "Acceptor" + elif row["PROTISDON"] == False: + prot_partner = row["Prot_partner"] ligcarbonidx = int(row["DONORIDX"]) - type_ = "Donor" - col_name = f"{prot_partner}_{ligcarbonidx}_{type_}_{interaction}" - elif interaction == "halogen": + interaction = row["INTERACTION"] + type = "Donor" + # Concatenate the values to form a unique column name + col_name = f"{prot_partner}_{ligcarbonidx}_{type}_{interaction}" + elif row["INTERACTION"] == "halogen": + prot_partner = row["Prot_partner"] ligcarbonidx = int(row["DON_IDX"]) halogen = row["DONORTYPE"] + interaction = row["INTERACTION"] + # Concatenate the values to form a unique column name col_name = f"{prot_partner}_{ligcarbonidx}_{halogen}_{interaction}" - elif interaction == "waterbridge": - if row["PROTISDON"]: + elif row["INTERACTION"] == "waterbridge": + if row["PROTISDON"] == True: + prot_partner = row["Prot_partner"] ligcarbonidx = int(row["ACCEPTOR_IDX"]) - type_ = "Acceptor" - else: + interaction = row["INTERACTION"] + type = "Acceptor" + # Concatenate the values to form a unique column name + col_name = f"{prot_partner}_{ligcarbonidx}_{type}_{interaction}" + elif row["PROTISDON"] == False: + prot_partner = row["Prot_partner"] ligcarbonidx = int(row["DONOR_IDX"]) - type_ = "Donor" - col_name = f"{prot_partner}_{ligcarbonidx}_{type_}_{interaction}" - elif interaction == "pistacking": + interaction = row["INTERACTION"] + type = "Donor" + # Concatenate the values to form a unique column name + col_name = f"{prot_partner}_{ligcarbonidx}_{type}_{interaction}" + elif row["INTERACTION"] == "pistacking": + prot_partner = row["Prot_partner"] ligcarbonidx = row["LIG_IDX_LIST"] + interaction = row["INTERACTION"] + # Concatenate the values to form a unique column name col_name = f"{prot_partner}_{ligcarbonidx}_{interaction}" - elif interaction == "pication": + elif row["INTERACTION"] == "pication": + prot_partner = row["Prot_partner"] ligidx = row["LIG_IDX_LIST"] ligtype = row["LIG_GROUP"] + interaction = row["INTERACTION"] + # Concatenate the values to form a unique column name col_name = f"{prot_partner}_{ligidx}_{ligtype}_{interaction}" col_name = col_name.replace(",", "_") - elif interaction == "saltbridge": + elif row["INTERACTION"] == "saltbridge": + prot_partner = row["Prot_partner"] ligidx = row["LIG_IDX_LIST"] lig_group = row["LIG_GROUP"] - if row["PROTISPOS"]: - type_ = "NI" - else: - type_ = "PI" - col_name = ( - f"{prot_partner}_{ligidx}_{lig_group}_{type_}_{interaction}" - ) - elif interaction == "metal": + interaction = row["INTERACTION"] + if row["PROTISPOS"] == True: + type = "NI" + # Concatenate the values to form a unique column name + col_name = ( + f"{prot_partner}_{ligidx}_{lig_group}_{type}_{interaction}" + ) + elif row["PROTISPOS"] == False: + type = "PI" + col_name = ( + f"{prot_partner}_{ligidx}_{lig_group}_{type}_{interaction}" + ) + elif row["INTERACTION"] == "metal": special_ligand = row["RESTYPE_LIG"] ligcarbonidx = int(row["TARGET_IDX"]) metal_type = row["METAL_TYPE"] coordination = row["COORDINATION"] + interaction = row["INTERACTION"] + # Concatenate the values to form a unique column name col_name = f"{special_ligand}_{ligcarbonidx}_{metal_type}_{coordination}_{interaction}" - frame_value = row["FRAME"] if frame_value not in unique_columns_rings_grouped: unique_columns_rings_grouped[frame_value] = {} - if interaction != "skip": + if row["INTERACTION"] != "skip": unique_columns_rings_grouped[frame_value][index] = col_name + # Add the column name and its value to the dictionary unique_columns_rings[index] = col_name - - else: # If peptide is not None + if self.peptide is not None: for index, row in df.iterrows(): - interaction = row["INTERACTION"] - prot_partner = row["Prot_partner"] - peptide_partner = f"{row['RESNR_LIG']}{row['RESTYPE_LIG']}" - - if interaction == "hydrophobic": + # Check if the 'INTERACTION' is 'hydrophobic' + if row["INTERACTION"] == "hydrophobic": + # Get the values from the current row + prot_partner = row["Prot_partner"] + peptide_partner = str(row["RESNR_LIG"]) + row["RESTYPE_LIG"] + interaction = row["INTERACTION"] + ring_found = False col_name = f"{prot_partner}_{peptide_partner}_{interaction}" - elif interaction == "hbond": - if row["PROTISDON"]: - type_ = "Acceptor" - else: - type_ = "Donor" - col_name = f"{prot_partner}_{peptide_partner}_{type_}_{interaction}" - elif interaction == "halogen": + elif row["INTERACTION"] == "hbond": + if row["PROTISDON"] == True: + prot_partner = row["Prot_partner"] + peptide_partner = str(row["RESNR_LIG"]) + row["RESTYPE_LIG"] + interaction = row["INTERACTION"] + type = "Acceptor" + elif row["PROTISDON"] == False: + prot_partner = row["Prot_partner"] + peptide_partner = str(row["RESNR_LIG"]) + row["RESTYPE_LIG"] + interaction = row["INTERACTION"] + type = "Donor" + # Concatenate the values to form a unique column name + col_name = f"{prot_partner}_{peptide_partner}_{type}_{interaction}" + elif row["INTERACTION"] == "halogen": + prot_partner = row["Prot_partner"] + peptide_partner = str(row["RESNR_LIG"]) + row["RESTYPE_LIG"] halogen = row["DONORTYPE"] + interaction = row["INTERACTION"] + # Concatenate the values to form a unique column name col_name = ( f"{prot_partner}_{peptide_partner}_{halogen}_{interaction}" ) - elif interaction == "waterbridge": - if row["PROTISDON"]: - type_ = "Acceptor" - else: - type_ = "Donor" - col_name = f"{prot_partner}_{peptide_partner}_{type_}_{interaction}" - elif interaction == "pistacking": + elif row["INTERACTION"] == "waterbridge": + if row["PROTISDON"] == True: + prot_partner = row["Prot_partner"] + peptide_partner = str(row["RESNR_LIG"]) + row["RESTYPE_LIG"] + interaction = row["INTERACTION"] + type = "Acceptor" + # Concatenate the values to form a unique column name + col_name = ( + f"{prot_partner}_{peptide_partner}_{type}_{interaction}" + ) + elif row["PROTISDON"] == False: + prot_partner = row["Prot_partner"] + peptide_partner = str(row["RESNR_LIG"]) + row["RESTYPE_LIG"] + interaction = row["INTERACTION"] + type = "Donor" + # Concatenate the values to form a unique column name + col_name = ( + f"{prot_partner}_{peptide_partner}_{type}_{interaction}" + ) + elif row["INTERACTION"] == "pistacking": + prot_partner = row["Prot_partner"] + peptide_partner = str(row["RESNR_LIG"]) + row["RESTYPE_LIG"] + interaction = row["INTERACTION"] + # Concatenate the values to form a unique column name col_name = f"{prot_partner}_{peptide_partner}_{interaction}" - elif interaction == "pication": - ligidx = f"{row['RESNR_LIG']}{row['RESTYPE_LIG']}" + elif row["INTERACTION"] == "pication": + prot_partner = row["Prot_partner"] + ligidx = str(row["RESNR_LIG"]) + row["RESTYPE_LIG"] ligtype = row["RESTYPE_LIG"] + interaction = row["INTERACTION"] + # Concatenate the values to form a unique column name col_name = f"{prot_partner}_{ligidx}_{ligtype}_{interaction}" col_name = col_name.replace(",", "_") - elif interaction == "saltbridge": - ligidx = f"{row['RESNR_LIG']}{row['RESTYPE_LIG']}" + elif row["INTERACTION"] == "saltbridge": + prot_partner = row["Prot_partner"] + ligidx = str(row["RESNR_LIG"]) + row["RESTYPE_LIG"] lig_group = row["RESTYPE_LIG"] - if row["PROTISPOS"]: - type_ = "NI" - else: - type_ = "PI" - col_name = ( - f"{prot_partner}_{ligidx}_{lig_group}_{type_}_{interaction}" - ) - elif interaction == "metal": + interaction = row["INTERACTION"] + if row["PROTISPOS"] == True: + type = "NI" + # Concatenate the values to form a unique column name + col_name = ( + f"{prot_partner}_{ligidx}_{lig_group}_{type}_{interaction}" + ) + elif row["PROTISPOS"] == False: + type = "PI" + col_name = ( + f"{prot_partner}_{ligidx}_{lig_group}_{type}_{interaction}" + ) + elif row["INTERACTION"] == "metal": special_ligand = row["RESTYPE_LIG"] - ligcarbonidx = f"{row['RESNR_LIG']}{row['RESTYPE_LIG']}" + ligcarbonidx = str(row["RESNR_LIG"]) + row["RESTYPE_LIG"] metal_type = row["METAL_TYPE"] coordination = row["COORDINATION"] + interaction = row["INTERACTION"] + # Concatenate the values to form a unique column name col_name = f"{special_ligand}_{ligcarbonidx}_{metal_type}_{coordination}_{interaction}" - frame_value = row["FRAME"] if frame_value not in unique_columns_rings_grouped: unique_columns_rings_grouped[frame_value] = {} - if interaction != "skip": + if row["INTERACTION"] != "skip": unique_columns_rings_grouped[frame_value][index] = col_name + # Add the column name and its value to the dictionary unique_columns_rings[index] = col_name - print("\033[1minteraction partners generated\033[0m") + return unique_columns_rings_grouped - def remove_duplicate_values( - self, data: Dict[int, Dict[int, str]] - ) -> Dict[int, Dict[int, str]]: + def remove_duplicate_values(self, data): """Remove the duplicate values from sub-dictionaries within the input dictionary. Args: @@ -212,9 +280,7 @@ def remove_duplicate_values( return unique_data - def combine_subdict_values( - self, data: Dict[int, Dict[int, str]] - ) -> Dict[str, List[str]]: + def combine_subdict_values(self, data): """Combines the values from the individual sub-dictionaries into a single list. Args: @@ -229,37 +295,52 @@ def combine_subdict_values( return combined_data - def filtering_values(self, threshold: float, df: pd.DataFrame) -> List[str]: + def filtering_values(self, threshold, df): """Filter and append values (interactions) to a DataFrame based on occurrence counts. Args: - threshold (float): A threshold value used for filtering the values (interactions) based on occurrence count. - df (pandas.DataFrame): DataFrame to which the filtered values (interactions) will be added. + threshold (float): A treshold value that is used for filtering of the values (interactions) based upon the occurence count. + df (pandas dataframe): DataFrame to which the filtered values (interactions) will be added. + unique_columns_rings_grouped (dict): Dictionary containing the grouped and unique values otained from gather_interactions. Returns: - list: A list of values, with unique values and their corresponding occurrence counts. + list: A list of values, with unique values and their corresponding occurence counts. """ + frames = len(self.pdb_md.trajectory) - 1 + # Call the function to remove duplicate keys unique_data = self.remove_duplicate_values(self.unique_columns_rings_grouped) + + # Call the function to combine sub-dictionary values unique_colums_rings_all = self.combine_subdict_values(unique_data) + + # Flatten the list of values all_values = unique_colums_rings_all["all"] + # Count the occurrences of each value occurrences = {} for value in all_values: - occurrences[value] = occurrences.get(value, 0) + 1 + if value in occurrences: + occurrences[value] += 1 + else: + occurrences[value] = 1 + # Calculate the threshold (20% of 1000) threshold = threshold * frames + + # Filter out values that appear in less than 20% of 1000 filtered_values = [ value for value, count in occurrences.items() if count >= threshold ] + # Append the filtered values as columns to the DataFrame for value in filtered_values: df[value] = None return filtered_values - def unique_data_generation(self, filtered_values: List[str]) -> Dict[str, str]: - """Generate a dictionary containing the unique interactions from a list of filtered values obtained by filtering_values. + def unique_data_generation(self, filtered_values): + """Generate a dictionary conataing the unique interactions from a list of filtered values obtained by filtering_values. Args: filtered_values (list): A list of values, where the unique interactions are extracted from. @@ -267,21 +348,25 @@ def unique_data_generation(self, filtered_values: List[str]) -> Dict[str, str]: Returns: dict: A dictionary containing the filtered unique interactions. """ + # Create a new dictionary to store unique values unique_data = {} + + # Loop through the elements of the data_list for element in filtered_values: + # Check if the element is not already in the unique_data dictionary keys if element not in unique_data: + # If not, add the element as both key and value unique_data[element] = element return unique_data - def df_iteration_numbering( - self, df: pd.DataFrame, unique_data: Dict[str, str] - ) -> None: + def df_iteration_numbering(self, df, unique_data): """Loop through the DataFrame and assign the values 1 and 0 to the rows, depending if the corresponding interaction from unique data is present. Args: - df (pd.DataFrame): DataFrame which has the interaction data for all of the frames. - unique_data (Dict[str, str]): Dictionary that contains the unique interactions obtained from unique_data_generation. + df (pandas dataframe): DataFrame which has the interaction data for all of the frames. + unique_data (dict): Dictionary that contains the unique interactions obtained from unique_data_generation. + peptide (str, optional): name of the peptide chainid in the original topology. Defaults to None. """ if self.peptide is None: for index, row in df.iterrows(): @@ -297,11 +382,13 @@ def df_iteration_numbering( row["INTERACTION"] == interaction ) df.at[index, col] = 1 if condition else 0 + else: + continue elif row["INTERACTION"] == "hbond": if row["PROTISDON"] == True: for col in unique_data.values(): if "hbond" in col: - prot_partner, ligcarbonidx, _type, interaction = ( + prot_partner, ligcarbonidx, type, interaction = ( col.split("_") ) ligcarbonidx = int(ligcarbonidx) @@ -314,7 +401,7 @@ def df_iteration_numbering( elif row["PROTISDON"] == False: for col in unique_data.values(): if "hbond" in col: - prot_partner, ligcarbonidx, _type, interaction = ( + prot_partner, ligcarbonidx, type, interaction = ( col.split("_") ) ligcarbonidx = int(ligcarbonidx) @@ -351,8 +438,8 @@ def df_iteration_numbering( elif row["INTERACTION"] == "waterbridge": for col in unique_data.values(): if "waterbridge" in col: - if row["PROTISDON"]: - prot_partner, ligcarbonidx, _type, interaction = ( + if row["PROTISDON"] == True: + prot_partner, ligcarbonidx, type, interaction = ( col.split("_") ) condition = ( @@ -361,8 +448,8 @@ def df_iteration_numbering( & (row["INTERACTION"] == interaction) ) df.at[index, col] = 1 if condition else 0 - else: - prot_partner, ligcarbonidx, _type, interaction = ( + elif row["PROTISDON"] == False: + prot_partner, ligcarbonidx, type, interaction = ( col.split("_") ) condition = ( @@ -395,7 +482,7 @@ def df_iteration_numbering( ligidx = parts[1:-3] ligidx = ",".join(ligidx) lig_group = parts[-3] - _type = parts[-2] + type = parts[-2] interaction = parts[-1] condition = ( (row["Prot_partner"] == prot_partner) @@ -419,8 +506,7 @@ def df_iteration_numbering( & (row["INTERACTION"] == interaction) ) df.at[index, col] = 1 if condition else 0 - - else: # If self.peptide is not None + if self.peptide is not None: for index, row in df.iterrows(): if row["INTERACTION"] == "hydrophobic": for col in unique_data.values(): @@ -434,11 +520,13 @@ def df_iteration_numbering( row["INTERACTION"] == interaction ) df.at[index, col] = 1 if condition else 0 + else: + continue elif row["INTERACTION"] == "hbond": - if row["PROTISDON"]: + if row["PROTISDON"] == True: for col in unique_data.values(): if "hbond" in col: - prot_partner, ligcarbonidx, _type, interaction = ( + prot_partner, ligcarbonidx, type, interaction = ( col.split("_") ) condition = ( @@ -450,10 +538,10 @@ def df_iteration_numbering( & (row["INTERACTION"] == interaction) ) df.at[index, col] = 1 if condition else 0 - else: + elif row["PROTISDON"] == False: for col in unique_data.values(): if "hbond" in col: - prot_partner, ligcarbonidx, _type, interaction = ( + prot_partner, ligcarbonidx, type, interaction = ( col.split("_") ) condition = ( @@ -497,31 +585,46 @@ def df_iteration_numbering( elif row["INTERACTION"] == "waterbridge": for col in unique_data.values(): if "waterbridge" in col: - prot_partner, ligcarbonidx, _type, interaction = col.split( - "_" - ) - condition = ( - (row["Prot_partner"] == prot_partner) - & ( - (str(row["RESNR_LIG"]) + row["RESTYPE_LIG"]) - == ligcarbonidx + if row["PROTISDON"] == True: + prot_partner, ligcarbonidx, type, interaction = ( + col.split("_") ) - & (row["INTERACTION"] == interaction) - ) - df.at[index, col] = 1 if condition else 0 + condition = ( + (row["Prot_partner"] == prot_partner) + & ( + (str(row["RESNR_LIG"]) + row["RESTYPE_LIG"]) + == ligcarbonidx + ) + & (row["INTERACTION"] == interaction) + ) + df.at[index, col] = 1 if condition else 0 + elif row["PROTISDON"] == False: + prot_partner, ligcarbonidx, type, interaction = ( + col.split("_") + ) + condition = ( + (row["Prot_partner"] == prot_partner) + & ( + (str(row["RESNR_LIG"]) + row["RESTYPE_LIG"]) + == ligcarbonidx + ) + & (row["INTERACTION"] == interaction) + ) + df.at[index, col] = 1 if condition else 0 elif row["INTERACTION"] == "pication": for col in unique_data.values(): if "pication" in col: parts = col.split("_") prot_partner = parts[0] - ligidx = parts[1:-2] - ligidx = ",".join(ligidx) + ligidx = parts[1] ligtype = parts[-2] interaction = parts[-1] condition = ( (row["Prot_partner"] == prot_partner) - & (row["RESNR_LIG"] == ligidx) - & (row["RESTYPE_LIG"] == ligtype) + & ( + (str(row["RESNR_LIG"]) + row["RESTYPE_LIG"]) + == ligidx + ) & (row["INTERACTION"] == interaction) ) df.at[index, col] = 1 if condition else 0 @@ -530,15 +633,16 @@ def df_iteration_numbering( if "saltbridge" in col: parts = col.split("_") prot_partner = parts[0] - ligidx = parts[1:-3] - ligidx = ",".join(ligidx) + ligidx = parts[1] lig_group = parts[-3] - _type = parts[-2] + type = parts[-2] interaction = parts[-1] condition = ( (row["Prot_partner"] == prot_partner) - & (row["RESNR_LIG"] == ligidx) - & (row["RESTYPE_LIG"] == lig_group) + & ( + (str(row["RESNR_LIG"]) + row["RESTYPE_LIG"]) + == ligidx + ) & (row["INTERACTION"] == interaction) ) df.at[index, col] = 1 if condition else 0 @@ -547,26 +651,27 @@ def df_iteration_numbering( if "metal" in col: parts = col.split("_") special_ligand = parts[0] - ligidx = int(parts[1]) + ligidx = parts[1] metal_type = parts[2] interaction = parts[-1] condition = ( (row["RESTYPE_LIG"] == special_ligand) - & (int(row["TARGET_IDX"]) == ligidx) + & ( + (str(row["RESNR_LIG"]) + row["RESTYPE_LIG"]) + == ligidx + ) & (row["METAL_TYPE"] == metal_type) & (row["INTERACTION"] == interaction) ) df.at[index, col] = 1 if condition else 0 - def update_values( - self, df: pd.DataFrame, new: pd.DataFrame, unique_data: Dict[str, str] - ) -> None: - """Update the values in the input DataFrame based upon the frame values and a reference DataFrame. + def update_values(self, df, new, unique_data): + """Update the values in the input DataFrame based upon the frame values and an reference DataFrame. Args: - df (pandas.DataFrame): Input DataFrame that will be updated. - new (pandas.DataFrame): The reference DataFrame containing values that are used to update the input DataFrame. - unique_data (Dict[str, str]): A dictionary containing keys that represent the specific unique column names that need to be updated in the input DataFrame. + df (pandas dataframe): Input DataFrame that will be updated. + new (pandas dataframe): The reference DataFrame containing values that are used to update the input DataFrame. + unique_data (dict): A dictionary containing keys that represent the specific unique column names that need to be updated in the input DataFrame. """ for idx, row in df.iterrows(): frame_value = row["FRAME"] From 17c4eca7ce2a0e036d3ae10c579aa1ce556ce853 Mon Sep 17 00:00:00 2001 From: Valerij Talagayev <82884038+talagayev@users.noreply.github.com> Date: Fri, 25 Oct 2024 19:40:40 +0200 Subject: [PATCH 15/23] Update rmsd_calculation.py adjusted code, removed duplicate mention of frames present from the earlier code --- .../openmmdl_analysis/rmsd_calculation.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/openmmdl/openmmdl_analysis/rmsd_calculation.py b/openmmdl/openmmdl_analysis/rmsd_calculation.py index 4715fbc1..65d64f76 100644 --- a/openmmdl/openmmdl_analysis/rmsd_calculation.py +++ b/openmmdl/openmmdl_analysis/rmsd_calculation.py @@ -50,7 +50,7 @@ def rmsd_for_atomgroups( self.universe, ref, select=selection1, groupselections=selection2 ) rmsd_analysis.run() - columns = [selection1] + selection2 if selection2 else [selection1] + columns = [selection1, *selection2] if selection2 else [selection1] rmsd_df = pd.DataFrame(np.round(rmsd_analysis.rmsd[:, 2:], 2), columns=columns) rmsd_df.index.name = "frame" @@ -123,7 +123,7 @@ def rmsd_dist_frames( img1, ax=ax, orientation="horizontal", fraction=0.1, label="RMSD (Å)" ) - plt.savefig(f"./RMSD/RMSD_between_the_frames.{fig_type}") + plt.savefig(f"{output_directory}/RMSD_between_the_frames.{fig_type}") return pairwise_rmsd_prot, pairwise_rmsd_lig def calc_rmsd_2frames(self, ref: np.ndarray, frame: np.ndarray) -> float: @@ -152,25 +152,24 @@ def calculate_distance_matrix(self, selection: str) -> np.ndarray: return distances def calculate_representative_frame( - self, bmode_frames: List[int], DM: np.ndarray + self, binding_mode_frames: List[int], distance_matrix: np.ndarray ) -> int: """Calculates the most representative frame for a binding mode. This is based upon the average RMSD of a frame to all other frames in the binding mode. Args: - bmode_frames (list): List of frames belonging to a binding mode. - DM (np.array): Distance matrix of trajectory. + binding_mode_frames (list): List of frames belonging to a binding mode. + distance_matrix (np.array): Distance matrix of trajectory. Returns: int: Number of the most representative frame. """ - frames: List[int] = bmode_frames mean_rmsd_per_frame: dict = {} - for frame_i in frames: + for frame_i in binding_mode_frames: mean_rmsd_per_frame[frame_i] = 0 - for frame_j in frames: + for frame_j in binding_mode_frames: if frame_j != frame_i: - mean_rmsd_per_frame[frame_i] += DM[frame_i - 1, frame_j - 1] - mean_rmsd_per_frame[frame_i] /= len(frames) + mean_rmsd_per_frame[frame_i] += distance_matrix[frame_i - 1, frame_j - 1] + mean_rmsd_per_frame[frame_i] /= len(binding_mode_frames) repre: int = min(mean_rmsd_per_frame, key=mean_rmsd_per_frame.get) From 1f56384563909d7f0ab1948f3fcd0e563605bf32 Mon Sep 17 00:00:00 2001 From: Valerij Talagayev <82884038+talagayev@users.noreply.github.com> Date: Fri, 25 Oct 2024 19:52:08 +0200 Subject: [PATCH 16/23] Update visualization_functions.py looks good, removed the NV.Widget, since I am not sure if it is the correct object and the type is not that important here what nglview object it is --- openmmdl/openmmdl_analysis/visualization_functions.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/openmmdl/openmmdl_analysis/visualization_functions.py b/openmmdl/openmmdl_analysis/visualization_functions.py index 37079ef2..8f935f7b 100644 --- a/openmmdl/openmmdl_analysis/visualization_functions.py +++ b/openmmdl/openmmdl_analysis/visualization_functions.py @@ -14,7 +14,7 @@ class TrajectorySaver: def __init__( self, pdb_md: mda.Universe, ligname: str, special: str, nucleic: bool ) -> None: - """Initializes the TrajectorySaver with an mda.Universe object, ligand name, special residue name, and receptor type. + """Initializes the TrajectorySaver with an mda.Universe object, ligand name, special residue name and receptor type. Args: pdb_md (mda.Universe): MDAnalysis Universe object containing the trajectory. @@ -30,7 +30,7 @@ def __init__( def save_interacting_waters_trajectory( self, interacting_waters: List[int], outputpath: str = "./Visualization/" ) -> None: - """Saves .pdb and .dcd files of the trajectory containing ligand, receptor, and all interacting waters. + """Saves .pdb and .dcd files of the trajectory containing ligand, receptor and all interacting waters. Args: interacting_waters (List[int]): List of all interacting water IDs. @@ -103,7 +103,7 @@ def visualize( receptor_type: str = "protein or nucleic", height: str = "1000px", width: str = "1000px", - ) -> nv.NGLWidget: + ): """Generates visualization of the trajectory with the interacting waters and interaction clouds. Args: @@ -112,7 +112,7 @@ def visualize( width (str, optional): Width of the visualization. Defaults to '1000px'. Returns: - nv.NGLWidget: Returns an nglview.widget object containing the visualization. + nglview widget: Returns an nglview.widget object containing the visualization. """ sphere_buffers = [] From cff415ccbc84df00069e8353f8ddd9f1c988fdd5 Mon Sep 17 00:00:00 2001 From: Valerij Talagayev <82884038+talagayev@users.noreply.github.com> Date: Fri, 25 Oct 2024 20:18:14 +0200 Subject: [PATCH 17/23] Update preprocessing.py added the comments back in and adjusted small parts --- openmmdl/openmmdl_analysis/preprocessing.py | 52 +++++++++++++++++++-- 1 file changed, 47 insertions(+), 5 deletions(-) diff --git a/openmmdl/openmmdl_analysis/preprocessing.py b/openmmdl/openmmdl_analysis/preprocessing.py index d24c16b3..db79d43d 100644 --- a/openmmdl/openmmdl_analysis/preprocessing.py +++ b/openmmdl/openmmdl_analysis/preprocessing.py @@ -1,13 +1,13 @@ import MDAnalysis as mda +import mdtraj as md import os import re from typing import List, Optional import rdkit from rdkit import Chem -from rdkit.Chem import AllChem -from rdkit.Chem import Mol +from rdkit.Chem import AllChem, Mol from openbabel import pybel -import mdtraj as md + class Preprocessing: @@ -25,12 +25,16 @@ def renumber_protein_residues( reference_pdb (str): Path to the reference PDB file representing the molecular dynamics trajectory used as a reference. output_pdb (str): Path to the output PDB file where the renumbered trajectory will be saved. """ + + # Load trajectories traj_input = md.load(input_pdb) traj_reference = md.load(reference_pdb) + # Get the topology DataFrames input_top_df, _ = traj_input.topology.to_dataframe() ref_top_df, _ = traj_reference.topology.to_dataframe() + # List of common protein residue names including additional residues protein_residues = [ "ALA", "ARG", @@ -59,7 +63,9 @@ def renumber_protein_residues( "HIP", ] + # Iterate over each chain in the reference topology for chain_id in ref_top_df["chainID"].unique(): + # Extract residue indices from the reference topology for the current chain ref_residue_indices = ( ref_top_df[ (ref_top_df["chainID"] == chain_id) @@ -68,14 +74,20 @@ def renumber_protein_residues( - 1 ) + # Update residue indices in the input topology DataFrame for the current chain mask = (input_top_df["chainID"] == chain_id) & input_top_df["resName"].isin( protein_residues ) input_top_df.loc[mask, "resSeq"] = ref_residue_indices + 1 + # Create a new topology from the modified DataFrame new_top = md.Topology.from_dataframe(input_top_df) + + # Update the topology in the input trajectory new_traj = traj_input.slice(range(traj_input.n_frames)) new_traj.topology = new_top + + # Save the renumbered trajectory to a new PDB file new_traj.save(output_pdb) def increase_ring_indices(self, ring: List[int], lig_index: int) -> List[int]: @@ -93,14 +105,18 @@ def increase_ring_indices(self, ring: List[int], lig_index: int) -> List[int]: def convert_ligand_to_smiles(self, input_sdf: str, output_smi: str) -> None: """ - Converts ligand structures from an SDF file to SMILES format. + Converts ligand structures from an SDF file to SMILES :) format. Args: input_sdf (str): Path to the SDF file with the ligand that will be converted. output_smi (str): Path to the output SMILES file. """ + # Create a molecule supplier from an SDF file mol_supplier = Chem.SDMolSupplier(input_sdf) + + # Open the output SMILES file for writing with open(output_smi, "w") as output_file: + # Iterate through molecules and convert each to SMILES for mol in mol_supplier: if mol is not None: smiles = Chem.MolToSmiles(mol) @@ -110,18 +126,22 @@ def convert_ligand_to_smiles(self, input_sdf: str, output_smi: str) -> None: def process_pdb_file(self, input_pdb_filename: str) -> None: """ - Process a PDB file to make it compatible with the openmmdl_analysis package. + Process a PDB file to make it compatible with OpenMMDL Analysis. Args: input_pdb_filename (str): Path to the input PDB file. """ + # Load the PDB file u = mda.Universe(input_pdb_filename) + # Iterate through the topology to modify residue names for atom in u.atoms: resname = atom.resname + # Check if the residue name is one of the specified water names if resname in ["SPC", "TIP3", "TIP4", "WAT", "T3P", "T4P", "T5P"]: atom.residue.resname = "HOH" elif resname == "*": atom.residue.resname = "UNK" + # Save the modified topology to a new PDB file u.atoms.write(input_pdb_filename) def extract_and_save_ligand_as_sdf( @@ -135,7 +155,10 @@ def extract_and_save_ligand_as_sdf( output_filename (str): Name of the output SDF file. target_resname (str): Residue name of the ligand in the original PDB file. """ + # Load the PDB file using MDAnalysis u = mda.Universe(input_pdb_filename) + + # Find the ligand by its residue name ligand_atoms = u.select_atoms(f"resname {target_resname}") if len(ligand_atoms) == 0: @@ -144,11 +167,14 @@ def extract_and_save_ligand_as_sdf( ) return + # Create a new Universe with only the ligand ligand_universe = mda.Merge(ligand_atoms) ligand_universe.atoms.write("lig.pdb") + # use openbabel to convert pdb to sdf mol = next(pybel.readfile("pdb", "lig.pdb")) mol.write("sdf", output_filename, overwrite=True) + # remove the temporary pdb file os.remove("lig.pdb") def renumber_atoms_in_residues( @@ -162,6 +188,7 @@ def renumber_atoms_in_residues( output_pdb_file (str): Path to the output PDB file. lig_name (str): Name of the ligand in the input PDB file. """ + # Read the input PDB file with open(input_pdb_file, "r") as f: pdb_lines = f.readlines() @@ -170,21 +197,33 @@ def renumber_atoms_in_residues( for line in pdb_lines: if line.startswith("ATOM"): + # Extract information from the ATOM line atom_serial = int(line[6:11]) atom_name = line[12:16].strip() residue_name = line[17:20].strip() + # Check if the residue is residue_name if residue_name == lig_name: + + # Extract the element from the atom name (assuming it starts with a capital letter) element_match = re.match(r"([A-Z]+)", atom_name) element = element_match.group(1) if element_match else atom_name + + # Increment the count for the current element in the current residue_name residue lig_residue_elements[element] = ( lig_residue_elements.get(element, 0) + 1 ) + + # Update the atom name based on the element count new_atom_name = f"{element}{lig_residue_elements[element]}" + + # Replace the old atom name with the new one line = line[:12] + f"{new_atom_name:4}" + line[16:] + # Append the line to the new PDB lines new_pdb_lines.append(line) + # Write the modified PDB lines to the output file with open(output_pdb_file, "w") as f: f.writelines(new_pdb_lines) @@ -201,7 +240,10 @@ def replace_atom_type(self, data: str) -> str: lines = data.split("\n") for i, line in enumerate(lines): if " LIG X" in line: + # Extract the last column which contains the atom type (O/N/H) atom_type = line[12:13].strip() + + # Replace 'X' with the correct atom type lines[i] = line.replace(" LIG X", f" LIG {atom_type}") return "\n".join(lines) From ac6ca37e681295b6d589b427794cfbfab79e0ddd Mon Sep 17 00:00:00 2001 From: Valerij Talagayev <82884038+talagayev@users.noreply.github.com> Date: Fri, 25 Oct 2024 20:43:03 +0200 Subject: [PATCH 18/23] Update barcode_generation.py adjusted small typos --- openmmdl/openmmdl_analysis/barcode_generation.py | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/openmmdl/openmmdl_analysis/barcode_generation.py b/openmmdl/openmmdl_analysis/barcode_generation.py index 8bf6c78b..931b3155 100644 --- a/openmmdl/openmmdl_analysis/barcode_generation.py +++ b/openmmdl/openmmdl_analysis/barcode_generation.py @@ -115,22 +115,13 @@ def interacting_water_ids(self, waterbridge_interactions: List[str]) -> List[int return list(set(interacting_waters)) -class BarcodeGenerator: - def __init__(self, df: pd.DataFrame): - self.df = df - - def generate_waterids_barcode(self, interaction: str) -> List[Union[int, int]]: - # Placeholder implementation - return [] - - class BarcodePlotter: def __init__(self, df_all: pd.DataFrame): """ Initializes the BarcodePlotter with a dataframe and BarcodeGenerator instance. Args: - df_all (pd.DataFrame): Dataframe containing all interactions from plip analysis. + df_all (pd.DataFrame): Dataframe containing all interactions from PLIP analysis. """ self.df_all = df_all self.barcode_gen = BarcodeGenerator(df_all) @@ -195,7 +186,7 @@ def plot_waterbridge_piechart( Args: waterbridge_barcodes (Dict[str, np.ndarray]): Dictionary where keys are interaction names and values are binary barcode arrays. waterbridge_interactions (List[str]): List of water bridge interaction names. - fig_type (str): File extension for the saved figures (e.g., 'png', 'pdf'). + fig_type (str): File extension for the saved figures (e.g., 'png', 'svg'). """ if not waterbridge_barcodes: print("No Piecharts to plot.") From 4b5fcd4648a0c666d383e3173b4ebe93a17be449 Mon Sep 17 00:00:00 2001 From: Valerij Talagayev <82884038+talagayev@users.noreply.github.com> Date: Tue, 29 Oct 2024 14:21:35 +0100 Subject: [PATCH 19/23] Update find_stable_waters.py adjusted and added the comments back in --- .../openmmdl_analysis/find_stable_waters.py | 39 +++++++++++++++++-- 1 file changed, 35 insertions(+), 4 deletions(-) diff --git a/openmmdl/openmmdl_analysis/find_stable_waters.py b/openmmdl/openmmdl_analysis/find_stable_waters.py index 390d4fc1..fb75208b 100644 --- a/openmmdl/openmmdl_analysis/find_stable_waters.py +++ b/openmmdl/openmmdl_analysis/find_stable_waters.py @@ -24,13 +24,18 @@ def trace_waters(self, output_directory: str) -> Tuple[pd.DataFrame, int]: Returns: Tuple[pd.DataFrame, int]: DataFrame containing stable water coordinates and total number of frames. """ + # Get the total number of frames for the progress bar total_frames = len(self.u.trajectory) + + # Create an empty DataFrame to store stable water coordinates stable_waters = pd.DataFrame( columns=["Frame", "Residue", "Oxygen_X", "Oxygen_Y", "Oxygen_Z"] ) + # Initialize variables for the previous frame's data prev_frame_coords = {} + # Iterate through frames with tqdm for the progress bar for ts in tqdm( self.u.trajectory, total=total_frames, @@ -39,6 +44,7 @@ def trace_waters(self, output_directory: str) -> Tuple[pd.DataFrame, int]: frame_num = ts.frame frame_coords = {} + # Iterate through oxygen atoms of the specified water type for atom in self.u.select_atoms("resname HOH and name O"): frame_coords[atom.index] = ( atom.position[0], @@ -46,19 +52,26 @@ def trace_waters(self, output_directory: str) -> Tuple[pd.DataFrame, int]: atom.position[2], ) + # Check if it's not the first frame if frame_num > 0: stable_coords = [] + # Iterate through the oxygen atoms in the current frame for atom_index, coords in frame_coords.items(): - prev_coords = prev_frame_coords.get(atom_index, coords) + prev_coords = prev_frame_coords.get( + atom_index, coords + ) # Get previous coordinates or use current if not found + # Calculate the distance between current and previous coordinates distance = np.linalg.norm(np.array(coords) - np.array(prev_coords)) + # If the distance is less than 1 Angstrom, consider it a stable water if distance < 1: stable_coords.append( (frame_num, atom_index, coords[0], coords[1], coords[2]) ) + # Append stable water coordinates to the stable_waters DataFrame if stable_coords: stable_waters = pd.concat( [ @@ -76,6 +89,7 @@ def trace_waters(self, output_directory: str) -> Tuple[pd.DataFrame, int]: ] ) + # Update the previous frame's coordinates prev_frame_coords = frame_coords stable_waters.to_csv( @@ -95,12 +109,16 @@ def perform_clustering_and_writing( Args: stable_waters (pd.DataFrame): DataFrame containing stable water coordinates. - cluster_eps (float): DBSCAN clustering epsilon parameter. + cluster_eps (float): DBSCAN clustering epsilon parameter. This is in Angstrom in this case + and defines which Water distances should be within one cluster. total_frames (int): Total number of frames. output_directory (str): Directory where output files will be saved. """ + + # Feature extraction: XYZ coordinates X = stable_waters[["Oxygen_X", "Oxygen_Y", "Oxygen_Z"]] + # List of percentages to iterate over percentage_values = [25, 50, 75, 90, 99] for percent in percentage_values: @@ -152,6 +170,7 @@ def write_pdb_clusters_and_representatives( pdb_lines.append(pdb_line) atom_counter += 1 + # Write the current cluster to a new PDB file output_filename = os.path.join( output_sub_directory, f"cluster_{label}.pdb" ) @@ -161,6 +180,7 @@ def write_pdb_clusters_and_representatives( pdb_file_counter += 1 + # Write representative water molecules to a PDB file representative_waters = clustered_waters.groupby("Cluster_Label").mean() representative_waters.reset_index(inplace=True) representative_filename = os.path.join( @@ -177,11 +197,15 @@ def stable_waters_pipeline(self, output_directory: str = "./stableWaters") -> No Args: output_directory (str, optional): Directory where output files will be saved. Default is "./stableWaters". """ + # Load the PDB and DCD files output_directory += "_clusterEps_" strEps = str(self.water_eps).replace(".", "") output_directory += strEps os.makedirs(output_directory, exist_ok=True) + + # Create a stable waters list by calling the process_trajectory_and_cluster function stable_waters, total_frames = self.trace_waters(output_directory) + # Now call perform_clustering_and_writing with the returned values self.perform_clustering_and_writing( stable_waters, self.water_eps, total_frames, output_directory ) @@ -205,9 +229,11 @@ def filter_and_parse_pdb(self, protein_pdb: str) -> Structure: ) ] + # Convert the list of lines to a string buffer pdb_string = "".join(lines) pdb_buffer = StringIO(pdb_string) + # Now parse the filtered lines parser = PDBParser(QUIET=True) structure = parser.get_structure("protein", pdb_buffer) @@ -219,7 +245,8 @@ def find_interacting_residues( representative_waters: pd.DataFrame, distance_threshold: float, ) -> Dict[int, List[Tuple[str, int]]]: - """Maps waters (e.g. the representative waters) to interacting residues of a different PDB structure input. + """Maps waters (e.g. the representative waters) to interacting residues of a different PDB structure input. + Use "filter_and_parse_pdb" to get the input for this function. Args: structure (Structure): Biopython PDB structure object. representative_waters (pd.DataFrame): DataFrame containing representative water coordinates. @@ -232,6 +259,7 @@ def find_interacting_residues( for model in structure: for chain in model: + # Check if the residue is a protein residue (not a heteroatom or water molecule) for residue in chain: if ( residue.id[0] == " " @@ -252,7 +280,7 @@ def find_interacting_residues( distance = np.linalg.norm(wat_coords - residue_coords) if distance < distance_threshold: - key = wat_index + key = wat_index # Assuming wat_index is the number of the water cluster if key not in interacting_residues: interacting_residues[key] = [] interacting_residues[key].append( @@ -306,9 +334,11 @@ def analyze_protein_and_water_interaction( strEps = str(cluster_eps).replace(".", "") output_directory += strEps + # Iterate over subdirectories for subdirectory in os.listdir(output_directory): subdirectory_path = os.path.join(output_directory, subdirectory) if os.path.isdir(subdirectory_path): + # Perform operations within each subdirectory representative_waters = self.read_pdb_as_dataframe( os.path.join(subdirectory_path, representative_waters_file) ) @@ -320,6 +350,7 @@ def analyze_protein_and_water_interaction( interacting_residues.items(), columns=["Cluster_Number", "Interacting_Residues"], ) + # Save result to each subdirectory result_df.to_csv( os.path.join(subdirectory_path, "interacting_residues.csv"), index=False, From f5aa320f85bddce6b3d2e3a069e12af3d4b052f2 Mon Sep 17 00:00:00 2001 From: Valerij Talagayev <82884038+talagayev@users.noreply.github.com> Date: Wed, 6 Nov 2024 15:46:39 +0100 Subject: [PATCH 20/23] Update interaction_gathering.py many modifications and removal of code lines, set it back to initial version --- .../interaction_gathering.py | 415 +++++++++++------- 1 file changed, 260 insertions(+), 155 deletions(-) diff --git a/openmmdl/openmmdl_analysis/interaction_gathering.py b/openmmdl/openmmdl_analysis/interaction_gathering.py index 08d0d8f3..aa56fbb0 100644 --- a/openmmdl/openmmdl_analysis/interaction_gathering.py +++ b/openmmdl/openmmdl_analysis/interaction_gathering.py @@ -3,44 +3,37 @@ import MDAnalysis as mda from tqdm import tqdm from plip.basic import config -from plip.structure.preparation import PDBComplex, PLInteraction +from plip.structure.preparation import PDBComplex, LigandFinder, Mol, PLInteraction from plip.exchange.report import BindingSiteReport from multiprocessing import Pool from functools import partial -from typing import List, Dict, Tuple, Optional, Union config.KEEPMOD = True class InteractionAnalyzer: def __init__( - self, - pdb_md: mda.Universe, - dataframe: Optional[str], - num_processes: int, - lig_name: str, - special_ligand: Optional[str], - peptide: Optional[str], - ) -> None: + self, pdb_md, dataframe, num_processes, lig_name, special_ligand, peptide + ): self.pdb_md = pdb_md self.dataframe = dataframe self.num_processes = num_processes self.lig_name = lig_name self.special = special_ligand self.peptide = peptide - self.interaction_list = self.process_trajectory() + self.ineraction_list = self.process_trajectory() def characterize_complex( self, pdb_file: str, binding_site_id: str - ) -> Optional[PLInteraction]: - """Characterize the protein-ligand complex and return their interaction set. + ) -> PLInteraction: + """Characterize the protein-ligand complex and return their interaction set Args: - pdb_file (str): Path to the PDB file. - binding_site_id (str): Identifier of the binding site. + pdb_file (str): A string, which represents the path to the PDB File + binding_site_id (str): A string that specifies the identifier of the binding site Returns: - Optional[PLInteraction]: Interactions if found; otherwise, None. + PLInteraction: A object representing the interactions if. If Binding site is not found returns None """ pdb_complex = PDBComplex() pdb_complex.load_pdb(pdb_file) @@ -50,30 +43,31 @@ def characterize_complex( == binding_site_id ): pdb_complex.characterize_complex(ligand) - return pdb_complex.interaction_sets[binding_site_id] - return None + return pdb_complex.interaction_sets[binding_site_id] - def retrieve_plip_interactions( - self, pdb_file: str, lig_name: str - ) -> Dict[str, Dict[str, List]]: - """Retrieve interactions from PLIP. + def retrieve_plip_interactions(self, pdb_file, lig_name): + """Retrieves the interactions from PLIP. Args: - pdb_file (str): Path to the PDB file of the complex. - lig_name (str): Name of the ligand in the complex topology. + pdb_file (str): The path of the PDB file of the complex. + lig_name (str): Name of the Ligand in the complex topology that will be analyzed. Returns: - Dict[str, Dict[str, List]]: Dictionary of binding sites and interactions. + dict: A dictionary of the binding sites and the interactions. """ protlig = PDBComplex() - protlig.load_pdb(pdb_file) + protlig.load_pdb(pdb_file) # load the pdb file for ligand in protlig.ligands: if str(ligand.longname) == lig_name: - protlig.characterize_complex(ligand) - sites: Dict[str, Dict[str, List]] = {} + protlig.characterize_complex( + ligand + ) # find ligands and analyze interactions + sites = {} + # loop over binding sites for key, site in sorted(protlig.interaction_sets.items()): - binding_site = BindingSiteReport(site) + binding_site = BindingSiteReport(site) # collect data about interactions + # tuples of *_features and *_info will be converted to pandas DataFrame keys = ( "hydrophobic", "hbond", @@ -84,6 +78,9 @@ def retrieve_plip_interactions( "halogen", "metal", ) + # interactions is a dictionary which contains relevant information for each + # of the possible interactions: hydrophobic, hbond, etc. in the considered + # binding site. interactions = { k: [getattr(binding_site, k + "_features")] + getattr(binding_site, k + "_info") @@ -93,23 +90,26 @@ def retrieve_plip_interactions( return sites - def retrieve_plip_interactions_peptide( - self, pdb_file: str - ) -> Dict[str, Dict[str, List]]: - """Retrieve interactions from PLIP for a peptide. + def retrieve_plip_interactions_peptide(self, pdb_file): + """Retrives the interactions from PLIP for a peptide. Args: - pdb_file (str): Path to the PDB file of the complex. + pdb_file (str): The path of the PDB file of the complex. + peptide (str): Chainid of the peptide that will be analyzed. Returns: - Dict[str, Dict[str, List]]: Dictionary of binding sites and interactions. + dict: A dictionary of the binding sites and the interactions. """ protlig = PDBComplex() - protlig.load_pdb(pdb_file) - protlig.characterize_complex(protlig.ligands[-1]) - sites: Dict[str, Dict[str, List]] = {} + protlig.load_pdb(pdb_file) # load the pdb file + protlig.characterize_complex( + protlig.ligands[-1] + ) # find ligands and analyze interactions + sites = {} + # loop over binding sites for key, site in sorted(protlig.interaction_sets.items()): - binding_site = BindingSiteReport(site) + binding_site = BindingSiteReport(site) # collect data about interactions + # tuples of *_features and *_info will be converted to pandas DataFrame keys = ( "hydrophobic", "hbond", @@ -120,6 +120,9 @@ def retrieve_plip_interactions_peptide( "halogen", "metal", ) + # interactions is a dictionary which contains relevant information for each + # of the possible interactions: hydrophobic, hbond, etc. in the considered + # binding site. interactions = { k: [getattr(binding_site, k + "_features")] + getattr(binding_site, k + "_info") @@ -130,19 +133,18 @@ def retrieve_plip_interactions_peptide( return sites def create_df_from_binding_site( - self, - selected_site_interactions: Dict[str, List], - interaction_type: str = "hbond", - ) -> pd.DataFrame: - """Create a DataFrame from a binding site and interaction type. + self, selected_site_interactions, interaction_type="hbond" + ): + """Creates a data frame from a binding site and interaction type. Args: - selected_site_interactions (Dict[str, List]): Precalculated interactions from PLIP for the selected site. - interaction_type (str, optional): Interaction type of interest (default is "hbond"). + selected_site_interactions (dict): Precaluclated interactions from PLIP for the selected site + interaction_type (str, optional): The interaction type of interest (default set to hydrogen bond). Defaults to "hbond". Returns: - pd.DataFrame: DataFrame with information from PLIP. + pandas dataframe: DataFrame with information retrieved from PLIP. """ + # check if interaction type is valid: valid_types = [ "hydrophobic", "hbond", @@ -161,17 +163,20 @@ def create_df_from_binding_site( interaction_type = "hbond" df = pd.DataFrame.from_records( + # data is stored AFTER the column names selected_site_interactions[interaction_type][1:], + # column names are always the first element columns=selected_site_interactions[interaction_type][0], ) return df - def change_lig_to_residue(self, file_path: str, new_residue_name: str) -> None: - """Reformat the topology file to change the ligand to a residue. + def change_lig_to_residue(self, file_path, new_residue_name): + """Reformats the topology file to change the ligand to a residue. This is needed for interactions with special ligands such as metal ions. Args: - file_path (str): Path to the topology file. - new_residue_name (str): New residue name. + file_path (str): Filepath of the topology file. + old_residue_name (str): Residue name of the ligand. + new_residue_name (str): New residue name of the ligand now changed to mimic an amino acid residue. """ with open(file_path, "r") as file: lines = file.readlines() @@ -179,10 +184,14 @@ def change_lig_to_residue(self, file_path: str, new_residue_name: str) -> None: with open(file_path, "w") as file: for line in lines: if line.startswith("HETATM") or line.startswith("ATOM"): + # Assuming the standard PDB format for simplicity + # You may need to adapt this part based on your specific PDB file atom_name = line[12:16].strip() residue_name = line[17:20].strip() + # Check if the residue name matches the one to be changed if residue_name == self.lig_name: + # Change the residue name to the new one modified_line = line[:17] + new_residue_name + line[20:] file.write(modified_line) else: @@ -190,65 +199,24 @@ def change_lig_to_residue(self, file_path: str, new_residue_name: str) -> None: else: file.write(line) - def process_frame_special(self, frame: int) -> pd.DataFrame: - """Process a single frame of MD simulation for special ligands. - - Args: - frame (int): Frame number to be processed. - - Returns: - pd.DataFrame: DataFrame containing interaction data for the processed frame. - """ - atoms_selected = self.pdb_md.select_atoms( - f"protein or nucleic or resname {self.lig_name} or (resname HOH and around 10 resname {self.lig_name}) or resname {self.special}" - ) - for num in self.pdb_md.trajectory[(frame) : (frame + 1)]: - atoms_selected.write(f"processing_frame_{frame}.pdb") - interaction_list = pd.DataFrame() - interactions_by_site = self.retrieve_plip_interactions( - f"processing_frame_{frame}.pdb", self.lig_name - ) - index_of_selected_site = -1 - selected_site = list(interactions_by_site.keys())[index_of_selected_site] - - interaction_types = [ - "hydrophobic", - "hbond", - "waterbridge", - "saltbridge", - "pistacking", - "pication", - "halogen", - "metal", - ] - - for interaction_type in interaction_types: - tmp_interaction = self.create_df_from_binding_site( - interactions_by_site[selected_site], - interaction_type=interaction_type, - ) - tmp_interaction["frame"] = frame - tmp_interaction["interaction_type"] = interaction_type - interaction_list = pd.concat([interaction_list, tmp_interaction]) - - os.remove(f"processing_frame_{frame}.pdb") - return interaction_list - - def process_frame(self, frame: int) -> pd.DataFrame: + def process_frame(self, frame): """Process a single frame of MD simulation. Args: - frame (int): Frame number to be processed. + frame (int): The number of the frame that will be processed. + pdb_md (mda universe): The MDAnalysis Universe class representation of the topology and the trajectory of the file that is being processed. + lig_name (str): Name of the ligand in the complex that will be analyzed. + special (str, optional): Name of the special ligand in the complex that will be analyzed. Defaults to None. + peptide (srt, optional): Chainid of the peptide that will be analyzed. Defaults to None. Returns: - pd.DataFrame: DataFrame containing interaction data for the processed frame. + pandas dataframe: A dataframe conatining the interaction data for the processed frame. """ atoms_selected = self.pdb_md.select_atoms( f"protein or nucleic or resname {self.lig_name} or (resname HOH and around 10 resname {self.lig_name}) or resname {self.special}" ) for num in self.pdb_md.trajectory[(frame) : (frame + 1)]: atoms_selected.write(f"processing_frame_{frame}.pdb") - interaction_list = pd.DataFrame() if self.peptide is None: interactions_by_site = self.retrieve_plip_interactions( f"processing_frame_{frame}.pdb", self.lig_name @@ -267,19 +235,68 @@ def process_frame(self, frame: int) -> pd.DataFrame: "metal", ] + interaction_list = pd.DataFrame() for interaction_type in interaction_types: tmp_interaction = self.create_df_from_binding_site( interactions_by_site[selected_site], interaction_type=interaction_type, ) - tmp_interaction["frame"] = frame - tmp_interaction["interaction_type"] = interaction_type + tmp_interaction["FRAME"] = int(frame) + tmp_interaction["INTERACTION"] = interaction_type interaction_list = pd.concat([interaction_list, tmp_interaction]) - else: + if os.path.exists(f"processing_frame_{frame}.pdb"): + os.remove(f"processing_frame_{frame}.pdb") + + if self.special is not None: + combi_lig_special = mda.Universe("ligand_special.pdb") + complex = mda.Universe("complex.pdb") + complex_all = complex.select_atoms("all") + result = self.process_frame_special(frame) + results_df = pd.concat(result, ignore_index=True) + results_df = results_df[results_df["LOCATION"] == "protein.sidechain"] + results_df["RESTYPE"] = results_df["RESTYPE"].replace( + ["HIS", "SER", "CYS"], self.lig_name + ) + results_df["LOCATION"] = results_df["LOCATION"].replace( + "protein.sidechain", "ligand" + ) + updated_target_idx = [] + + for index, row in results_df.iterrows(): + ligand_special_int_nr = int(row["TARGET_IDX"]) + ligand_special_int_nr_atom = combi_lig_special.select_atoms( + f"id {ligand_special_int_nr}" + ) + for atom in ligand_special_int_nr_atom: + atom_name = atom.name + # Adjust atom_name based on the specified conditions + if atom_name in ["N", "C", "O", "S"]: + atom_name = f"{atom_name}1" + else: + # Assuming the format is a single letter followed by a number + base_name, atom_number = atom_name[:-1], int(atom_name[-1]) + new_atom_number = atom_number + 1 + atom_name = f"{base_name}{new_atom_number}" + for complex_atom in complex_all: + complex_atom_name = complex_atom.name + if atom_name == complex_atom_name: + true_number = complex_atom.id + break # Exit the loop once a match is found + updated_target_idx.append(true_number) + + # Update 'TARGET_IDX' in interaction_list + results_df["TARGET_IDX"] = updated_target_idx + interaction_list["TARGET_IDX"] = interaction_list["TARGET_IDX"] + + # Concatenate the updated results_df to interaction_list + interaction_list = pd.concat([interaction_list, results_df]) + if self.peptide is not None: interactions_by_site = self.retrieve_plip_interactions_peptide( f"processing_frame_{frame}.pdb" ) - selected_site = list(interactions_by_site.keys())[0] + index_of_selected_site = -1 + selected_site = list(interactions_by_site.keys())[index_of_selected_site] + interaction_types = [ "hydrophobic", "hbond", @@ -290,85 +307,173 @@ def process_frame(self, frame: int) -> pd.DataFrame: "halogen", "metal", ] + + interaction_list = pd.DataFrame() for interaction_type in interaction_types: tmp_interaction = self.create_df_from_binding_site( interactions_by_site[selected_site], interaction_type=interaction_type, ) - tmp_interaction["frame"] = frame - tmp_interaction["interaction_type"] = interaction_type + tmp_interaction["FRAME"] = int(frame) + tmp_interaction["INTERACTION"] = interaction_type interaction_list = pd.concat([interaction_list, tmp_interaction]) + if os.path.exists(f"processing_frame_{frame}.pdb"): + os.remove(f"processing_frame_{frame}.pdb") - os.remove(f"processing_frame_{frame}.pdb") return interaction_list - def process_frame_wrapper(self, frame: int) -> pd.DataFrame: - """Wrapper function to handle processing of a frame, including special cases. + def process_frame_special(self, frame): + """Function extension of process_frame to process special ligands. Args: - frame (int): Frame number to be processed. + frame (int): Number of the frame that will be processed. + pdb_md (mda universe): MDA Universe class representation of the topology and the trajectory of the file that is being processed. + lig_name (str): Name of the ligand in the complex that will be analyzed. + special (str, optional): Name of the special ligand that will be analysed. Defaults to None. Returns: - pd.DataFrame: DataFrame containing interaction data for the processed frame. + list: list of dataframes containing the interaction data for the processed frame with the special ligand. """ - if self.special: - return self.process_frame_special(frame) - return self.process_frame(frame) + res_renaming = ["HIS", "SER", "CYS"] + interaction_dfs = [] + for res in res_renaming: + self.pdb_md.trajectory[frame] + atoms_selected = self.pdb_md.select_atoms( + f"resname {self.lig_name} or resname {self.special}" + ) + atoms_selected.write(f"processing_frame_{frame}.pdb") + self.change_lig_to_residue(f"processing_frame_{frame}.pdb", res) + interactions_by_site = self.retrieve_plip_interactions( + f"processing_frame_{frame}.pdb", self.special + ) + index_of_selected_site = -1 + selected_site = list(interactions_by_site.keys())[index_of_selected_site] + interaction_types = ["metal"] + interaction_list = pd.DataFrame() + for interaction_type in interaction_types: + tmp_interaction = self.create_df_from_binding_site( + interactions_by_site[selected_site], + interaction_type=interaction_type, + ) + tmp_interaction["FRAME"] = int(frame) + tmp_interaction["INTERACTION"] = interaction_type + interaction_list = pd.concat([interaction_list, tmp_interaction]) + interaction_dfs.append(interaction_list) + os.remove(f"processing_frame_{frame}.pdb") + return interaction_dfs - def fill_missing_frames(self, df: pd.DataFrame) -> pd.DataFrame: - """Fill in missing frames in the DataFrame with NaNs. + def process_frame_wrapper(self, args): + """Wrapper for the MD Trajectory procession. Args: - df (pd.DataFrame): Original DataFrame with potentially missing frames. + args (tuple): Tuple containing (frame_idx: int - number of the frame to be processed, pdb_md: mda universe - MDA Universe class representation of the topology and the trajectory of the file that is being processed, lig_name: str - Name of the ligand in the complex that will be analyzed, special_ligand: str - Name of the special ligand that will be analysed, peptide: str - Chainid of the peptide that will be analyzed) Returns: - pd.DataFrame: DataFrame with missing frames filled. + tuple: tuple containing the frame index and the result of from the process_frame function. """ - all_frames = set(range(len(self.pdb_md.trajectory))) - existing_frames = set(df["frame"].unique()) - missing_frames = all_frames - existing_frames - missing_data = [] - - for frame in missing_frames: - for interaction_type in df["interaction_type"].unique(): - missing_data.append( - { - "frame": frame, - "interaction_type": interaction_type, - # Add other necessary columns with NaNs or empty values as appropriate - } - ) + frame_idx, pdb_md, lig_name, special_ligand, peptide = args + + return frame_idx, self.process_frame(frame_idx) - missing_df = pd.DataFrame(missing_data) - filled_df = pd.concat([df, missing_df], ignore_index=True) - return filled_df + def fill_missing_frames(self, df): + """Fills the frames with no interactions in the DataFrame with placeholder values. - def process_trajectory(self) -> pd.DataFrame: - """Main method for processing the trajectory. + Args: + df (pandas dataframe): The input DataFrame with frames that have no Interactions + md_len (int): The value that indicates the number of frames, thus allowing the function to loop through the DataFrame Returns: - pd.DataFrame: A DataFrame containing all interaction data for the trajectory. + pandas dataframe: DataFrame with placeholder values in the frames with no interactions. """ - pool = Pool(processes=self.num_processes) - with tqdm(total=len(self.pdb_md.trajectory)) as pbar: - interaction_list = pd.concat( - list( - tqdm( - pool.imap( - self.process_frame_wrapper, - range(len(self.pdb_md.trajectory)), - ), - total=len(self.pdb_md.trajectory), - ) + + md_len = len(self.pdb_md.trajectory) - 1 + # Create a set containing all unique values in the 'FRAME' column + existing_frames = set(df["FRAME"]) + + # Create a list to store new rows for missing numbers + missing_rows = [] + + # Iterate through numbers from 0 to md_len + for frame_number in range(1, md_len): + if frame_number not in existing_frames: + # Create a new row with 'FRAME' set to the missing number and other columns set to "skip" + missing_row = {"FRAME": frame_number} + for col in df.columns: + if col != "FRAME": + missing_row[col] = "skip" + missing_rows.append(missing_row) + + # Concatenate the missing rows with the original DataFrame + df = pd.concat([df, pd.DataFrame(missing_rows)], ignore_index=True) + + # Sort the DataFrame by the 'FRAME' column + df.sort_values(by="FRAME", inplace=True) + + return df + + def process_trajectory(self): + """Process protein-ligand trajectory with multiple CPUs in parallel. + + Args: + pdb_md (mda universe): MDAnalysis Universe class representation of the topology and the trajectory of the file that is being processed. + dataframe (str): Name of a CSV file as str, where the interaction data will be read from if not None. + num_processes (int): The number of CPUs that will be used for the processing of the protein-ligand trajectory. Defaults to half of the CPUs in the system. + lig_name (str): Name of the Ligand in the complex that will be analyzed. + special_ligand (str): Name of the special ligand in the complex that will be analyzed. + peptide (str): Chainid of the peptide that will be analyzed. + + Returns: + pandas dataframe: A DataFrame containing all the protein-ligand interaction data from the whole trajectory. + """ + if self.dataframe is None: + print("\033[1mProcessing protein-ligand trajectory\033[0m") + print(f"\033[1mUsing {self.num_processes} CPUs\033[0m") + total_frames = len(self.pdb_md.trajectory) - 1 + + with Pool(processes=self.num_processes) as pool: + frame_args = [ + (i, self.pdb_md, self.lig_name, self.special, self.peptide) + for i in range(1, total_frames + 1) + ] + + # Initialize the progress bar with the total number of frames + pbar = tqdm( + total=total_frames, + ascii=True, + desc="\033[1mAnalyzing frames\033[0m", ) - ) - pool.close() - pool.join() - if self.dataframe: - interaction_list.to_csv(self.dataframe) + results = [] + for result in pool.imap(self.process_frame_wrapper, frame_args): + results.append(result) + pbar.update(1) # Update the progress manually + + # Close the progress bar + pbar.close() + + # Extract the results and sort them by frame index + results.sort(key=lambda x: x[0]) + interaction_lists = [result[1] for result in results] + + interaction_list = pd.concat(interaction_lists) + + interaction_list.to_csv("interactions_gathered.csv") + + elif self.dataframe is not None: + print(f"\033[1mGathering data from {self.dataframe}\033[0m") + interaction_tmp = pd.read_csv(self.dataframe) + interaction_list = interaction_tmp.drop(interaction_tmp.columns[0], axis=1) + + interaction_list["Prot_partner"] = ( + interaction_list["RESNR"].astype(str) + + interaction_list["RESTYPE"] + + interaction_list["RESCHAIN"] + ) + + interaction_list = self.fill_missing_frames( + interaction_list, + ) - # Ensure all frames are represented in the final DataFrame - interaction_list = self.fill_missing_frames(interaction_list) + print("\033[1mProtein-ligand trajectory processed\033[0m") return interaction_list From 5c9603ce47ac801cddb9c10eef5a816c41697287 Mon Sep 17 00:00:00 2001 From: Valerij Talagayev <82884038+talagayev@users.noreply.github.com> Date: Wed, 6 Nov 2024 16:54:26 +0100 Subject: [PATCH 21/23] Update pharmacophore.py adjusted --- openmmdl/openmmdl_analysis/pharmacophore.py | 69 ++++++++++----------- 1 file changed, 34 insertions(+), 35 deletions(-) diff --git a/openmmdl/openmmdl_analysis/pharmacophore.py b/openmmdl/openmmdl_analysis/pharmacophore.py index 76d00263..d0f684ef 100644 --- a/openmmdl/openmmdl_analysis/pharmacophore.py +++ b/openmmdl/openmmdl_analysis/pharmacophore.py @@ -216,7 +216,7 @@ def generate_md_pharmacophore_cloudcenters( feature_type = feature_types[interaction] if interaction in ["Acceptor_hbond", "Donor_hbond"]: pharm = self.generate_pharmacophore_vectors( - self.df_all.filter(regex=interaction).columns.tolist() + self.df_all.filter(regex=interaction).columns ) for feature_name, position in pharm.items(): feature_id_counter += 1 @@ -273,7 +273,7 @@ def generate_md_pharmacophore_cloudcenters( ) elif interaction in ["hydrophobic", "PI_saltbridge", "NI_saltbridge"]: pharm = self.generate_pharmacophore_centers( - self.df_all.filter(regex=interaction).columns.tolist() + self.df_all.filter(regex=interaction).columns ) for feature_name, position in pharm.items(): feature_id_counter += 1 @@ -298,7 +298,7 @@ def generate_md_pharmacophore_cloudcenters( ) elif interaction == "pistacking": pharm = self.generate_pharmacophore_vectors( - self.df_all.filter(regex=interaction).columns.tolist() + self.df_all.filter(regex=interaction).columns ) feature_id_counter += 1 lig_loc = position[0] @@ -375,22 +375,21 @@ def generate_bindingmode_pharmacophore( pharmacophoreType="LIGAND_SCOUT", ) - for interaction, coords in dict_bindingmode.items(): - # Get feature type - feature_type = next( - (ft for it, ft in feature_types.items() if it in interaction), None - ) - if feature_type is None: - continue + for interaction in dict_bindingmode.keys(): + # get feature type + for interactiontype in feature_types.keys(): + if interactiontype in interaction: + feature_type = feature_types[interactiontype] + break # Generate vector features if feature_type in ["HBA", "HBD"]: if feature_type == "HBA": - orig_loc = coords["PROTCOO"][0] - targ_loc = coords["LIGCOO"][0] + orig_loc = dict_bindingmode[interaction]["PROTCOO"][0] + targ_loc = dict_bindingmode[interaction]["LIGCOO"][0] elif feature_type == "HBD": - orig_loc = coords["LIGCOO"][0] - targ_loc = coords["PROTCOO"][0] + orig_loc = dict_bindingmode[interaction]["LIGCOO"][0] + targ_loc = dict_bindingmode[interaction]["PROTCOO"][0] feature_id_counter += 1 points_to_lig = "true" if feature_type == "HBA" else "false" hasSyntheticProjectedPoint = "false" @@ -404,7 +403,7 @@ def generate_bindingmode_pharmacophore( optional="false", disabled="false", weight="1.0", - coreCompound=self.ligand_name, + coreCompound=self.complex_name, id=f"feature{str(feature_id_counter)}", ) origin = ET.SubElement( @@ -425,7 +424,7 @@ def generate_bindingmode_pharmacophore( ) # Generate point features elif feature_type in ["H", "PI", "NI"]: - position = coords["LIGCOO"][0] + position = dict_bindingmode[interaction]["LIGCOO"][0] feature_id_counter += 1 point = ET.SubElement( pharmacophore, @@ -435,7 +434,7 @@ def generate_bindingmode_pharmacophore( optional="false", disabled="false", weight="1.0", - coreCompound=self.ligand_name, + coreCompound=self.complex_name, id=f"feature{str(feature_id_counter)}", ) position = ET.SubElement( @@ -449,8 +448,8 @@ def generate_bindingmode_pharmacophore( # Generate plane features elif feature_type == "AR": feature_id_counter += 1 - lig_loc = coords["LIGCOO"][0] - prot_loc = coords["PROTCOO"][0] + lig_loc = dict_bindingmode[interaction]["LIGCOO"][0] + prot_loc = dict_bindingmode[interaction]["PROTCOO"][0] # Normalize vector of plane vector = np.array(lig_loc) - np.array(prot_loc) @@ -465,7 +464,7 @@ def generate_bindingmode_pharmacophore( optional="false", disabled="false", weight="1.0", - coreCompound=self.ligand_name, + coreCompound=self.complex_name, id=f"feature{str(feature_id_counter)}", ) position = ET.SubElement( @@ -526,25 +525,25 @@ def generate_point_cloud_pml(self, outname: str) -> None: """ cloud_dict = {} cloud_dict["H"] = self.generate_pharmacophore_centers_all_points( - self.df_all.filter(regex="hydrophobic").columns.tolist() + self.df_all.filter(regex="hydrophobic").columns ) cloud_dict["HBA"] = self.generate_pharmacophore_centers_all_points( - self.df_all.filter(regex="Acceptor_hbond").columns.tolist() + self.df_all.filter(regex="Acceptor_hbond").columns ) cloud_dict["HBD"] = self.generate_pharmacophore_centers_all_points( - self.df_all.filter(regex="Donor_hbond").columns.tolist() + self.df_all.filter(regex="Donor_hbond").columns ) cloud_dict["AR"] = self.generate_pharmacophore_centers_all_points( - self.df_all.filter(regex="pistacking").columns.tolist() + self.df_all.filter(regex="pistacking").columns ) cloud_dict["PI"] = self.generate_pharmacophore_centers_all_points( - self.df_all.filter(regex="PI_saltbridge").columns.tolist() + self.df_all.filter(regex="PI_saltbridge").columns ) cloud_dict["NI"] = self.generate_pharmacophore_centers_all_points( - self.df_all.filter(regex="NI_saltbridge").columns.tolist() + self.df_all.filter(regex="NI_saltbridge").columns ) cloud_dict["M"] = self.generate_pharmacophore_centers_all_points( - self.df_all.filter(regex="metal").columns.tolist() + self.df_all.filter(regex="metal").columns ) pharmacophore = ET.Element( @@ -554,9 +553,9 @@ def generate_point_cloud_pml(self, outname: str) -> None: pharmacophoreType="LIGAND_SCOUT", ) feature_id_counter = 0 - for feature_type, interactions in cloud_dict.items(): - for interaction, points in interactions.items(): - if len(points) > 1: + for feature_type in cloud_dict.keys(): + for interaction in cloud_dict[feature_type].keys(): + if len(cloud_dict[feature_type][interaction]) > 1: feature_cloud = ET.SubElement( pharmacophore, "featureCloud", @@ -570,12 +569,12 @@ def generate_point_cloud_pml(self, outname: str) -> None: position = ET.SubElement( feature_cloud, "position", - x3=str(points[0][0]), - y3=str(points[0][1]), - z3=str(points[0][2]), + x3=str(cloud_dict[feature_type][interaction][0][0]), + y3=str(cloud_dict[feature_type][interaction][0][1]), + z3=str(cloud_dict[feature_type][interaction][0][2]), ) - for additional_point in points[1:]: - additional_point_element = ET.SubElement( + for additional_point in cloud_dict[feature_type][interaction][1:]: + additional_point = ET.SubElement( feature_cloud, "additionalPoint", x3=str(round(additional_point[0], 2)), From 0dc21e11061ef5a4aa9acd501831604e84a8078e Mon Sep 17 00:00:00 2001 From: Valerij Talagayev <82884038+talagayev@users.noreply.github.com> Date: Wed, 6 Nov 2024 16:57:49 +0100 Subject: [PATCH 22/23] Delete openmmdl/openmmdl_analysis/visualization.ipynb --- .../openmmdl_analysis/visualization.ipynb | 94 ------------------- 1 file changed, 94 deletions(-) delete mode 100644 openmmdl/openmmdl_analysis/visualization.ipynb diff --git a/openmmdl/openmmdl_analysis/visualization.ipynb b/openmmdl/openmmdl_analysis/visualization.ipynb deleted file mode 100644 index ab4e7b09..00000000 --- a/openmmdl/openmmdl_analysis/visualization.ipynb +++ /dev/null @@ -1,94 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "from openmmdl.openmmdl_analysis.visualization_functions import Visualizer\n", - "import MDAnalysis as mda" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "ligname = \"UNK\" # add the name of the ligand from the original PDB File\n", - "special = None # if special ligand like HEM is present add the name of the special ligand from the original PDB File\n", - "md = mda.Universe(\"PATH_TO_interacting_waters.pdb\", \"PATH_TO_interacting_waters.dcd\")\n", - "cloud = \"PATH_TO_clouds.json\"\n", - "\n", - "visualizer = Visualizer(md, cloud, ligname, special)" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "09acfdb0e59f46d481905cef43c231c0", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "ThemeManager()" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "4b33c5fd6d6541ed8b9766bdbfe27e42", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "NGLWidget(gui_style='ngl', layout=Layout(height='1000px', width='1000px'), max_frame=9999)" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "# Predefined variables: receptor_type=\"protein or nucleic\", height=\"1000px\", width=\"1000px\"\n", - "view = visualizer.visualize()\n", - "view.display(gui=True, style=\"ngl\")" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.10" - }, - "vscode": { - "interpreter": { - "hash": "e021570d17280ebc37ed8e9fc470ea25281789feeed8bd826cec564b6552ec2d" - } - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} From dc5b15464522b979a0c7bce2190fc14e34a1dfc1 Mon Sep 17 00:00:00 2001 From: Valerij Talagayev <82884038+talagayev@users.noreply.github.com> Date: Wed, 6 Nov 2024 17:09:56 +0100 Subject: [PATCH 23/23] Update rdkit_figure_generation.py adjusted --- .../rdkit_figure_generation.py | 202 ++++++++++-------- 1 file changed, 111 insertions(+), 91 deletions(-) diff --git a/openmmdl/openmmdl_analysis/rdkit_figure_generation.py b/openmmdl/openmmdl_analysis/rdkit_figure_generation.py index 694b0337..94860543 100644 --- a/openmmdl/openmmdl_analysis/rdkit_figure_generation.py +++ b/openmmdl/openmmdl_analysis/rdkit_figure_generation.py @@ -186,93 +186,119 @@ def highlight_numbers( numeric_codes = parts[1:-1] interaction_type = parts[-1] - for code in numeric_codes: - atom_index = int(code) - complex_id = self.complex.select_atoms(f"id {atom_index}") - atom_name = complex_id[0].name if len(complex_id) > 0 else None - - if atom_name is None: - continue - - lig_atom = self.lig_noh.select_atoms(f"name {atom_name}") - lig_real_index = lig_atom[0].id - 1 if len(lig_atom) > 0 else None - - if lig_real_index is None: - continue - - if interaction_type == "hbond": - type = parts[-2] + if interaction_type == "hbond": + parts = item.split() + protein_partner_name = parts[0] + numeric_codes = parts[1:-2] + type = parts[-2] + interaction_type = parts[-1] + for code in numeric_codes: + atom_index = int(code) + complex_id = self.complex.select_atoms(f"id {atom_index}") + for atom in complex_id: + atom_name = atom.name + for lig_atom in self.lig_noh: + if lig_atom.name == atom_name: + lig_real_index = lig_atom.id if type == "Donor": - highlighted_hbond_donor.append(lig_real_index) + highlighted_hbond_donor.append(lig_real_index - 1) elif type == "Acceptor": - highlighted_hbond_acceptor.append(lig_real_index) - - elif interaction_type == "hydrophobic": - highlighted_hydrophobic.append(lig_real_index) - - elif interaction_type == "waterbridge": - highlighted_waterbridge.append(lig_real_index) - - elif interaction_type == "pistacking": + highlighted_hbond_acceptor.append(lig_real_index - 1) + + elif interaction_type == "hydrophobic": + for code in numeric_codes: + atom_index = int(code) + complex_id = self.complex.select_atoms(f"id {atom_index}") + for atom in complex_id: + atom_name = atom.name + for lig_atom in self.lig_noh: + if lig_atom.name == atom_name: + lig_real_index = lig_atom.id + highlighted_hydrophobic.append(lig_real_index - 1) + + elif interaction_type == "waterbridge": + for code in numeric_codes: + atom_index = int(code) + complex_id = self.complex.select_atoms(f"id {atom_index}") + for atom in complex_id: + atom_name = atom.name + for lig_atom in self.lig_noh: + if lig_atom.name == atom_name: + lig_real_index = lig_atom.id + highlighted_waterbridge.append(lig_real_index - 1) + + elif interaction_type == "pistacking": + split_codes = numeric_codes[0].split(",") + for code in split_codes: + atom_index = int(code) + complex_id = self.complex.select_atoms(f"id {atom_index}") + for atom in complex_id: + atom_name = atom.name + for lig_atom in self.lig_noh: + if lig_atom.name == atom_name: + lig_real_index = lig_atom.id + highlighted_pistacking.append(lig_real_index - 1) + + elif interaction_type == "halogen": + numeric_codes = parts[1:-2] + for code in numeric_codes: + atom_index = int(code) + complex_id = self.complex.select_atoms(f"id {atom_index}") + for atom in complex_id: + atom_name = atom.name + for lig_atom in self.lig_noh: + if lig_atom.name == atom_name: + lig_real_index = lig_atom.id + highlighted_halogen.append(lig_real_index - 1) + + elif interaction_type == "saltbridge": + numeric_codes = parts[1:-3] + saltbridge_type = parts[-2] + if saltbridge_type == "NI": split_codes = numeric_codes[0].split(",") for code in split_codes: atom_index = int(code) complex_id = self.complex.select_atoms(f"id {atom_index}") - atom_name = complex_id[0].name if len(complex_id) > 0 else None - if atom_name is None: - continue - lig_atom = self.lig_noh.select_atoms(f"name {atom_name}") - lig_real_index = ( - lig_atom[0].id - 1 if len(lig_atom) > 0 else None - ) - if lig_real_index is not None: - highlighted_pistacking.append(lig_real_index) - - elif interaction_type == "halogen": - highlighted_halogen.append(lig_real_index) - - elif interaction_type == "saltbridge": - saltbridge_type = parts[-2] - if saltbridge_type == "NI": - split_codes = numeric_codes[0].split(",") - for code in split_codes: - atom_index = int(code) - complex_id = self.complex.select_atoms(f"id {atom_index}") - atom_name = ( - complex_id[0].name if len(complex_id) > 0 else None - ) - if atom_name is None: - continue - lig_atom = self.lig_noh.select_atoms(f"name {atom_name}") - lig_real_index = ( - lig_atom[0].id - 1 if len(lig_atom) > 0 else None - ) - if lig_real_index is not None: - highlighted_ni.append(lig_real_index) - - elif saltbridge_type == "PI": - for code in numeric_codes: - atom_index = int(code) - complex_id = self.complex.select_atoms(f"id {atom_index}") - atom_name = ( - complex_id[0].name if len(complex_id) > 0 else None - ) - if atom_name is None: - continue - lig_atom = self.lig_noh.select_atoms(f"name {atom_name}") - lig_real_index = ( - lig_atom[0].id - 1 if len(lig_atom) > 0 else None - ) - if lig_real_index is not None: - highlighted_pi.append(lig_real_index) - - elif interaction_type == "pication": - highlighted_pication.append(lig_real_index) - - elif interaction_type == "metal": - highlighted_metal.append(lig_real_index) - - # Identify common indices between hbond_donor and hbond_acceptor + for atom in complex_id: + atom_name = atom.name + for lig_atom in self.lig_noh: + if lig_atom.name == atom_name: + lig_real_index = lig_atom.id + highlighted_ni.append(lig_real_index - 1) + elif saltbridge_type == "PI": + for code in numeric_codes: + atom_index = int(code) + complex_id = self.complex.select_atoms(f"id {atom_index}") + for atom in complex_id: + atom_name = atom.name + for lig_atom in self.lig_noh: + if lig_atom.name == atom_name: + lig_real_index = lig_atom.id + highlighted_pi.append(lig_real_index - 1) + + elif interaction_type == "pication": + numeric_codes = parts[1:-2] + for code in numeric_codes: + atom_index = int(code) + complex_id = self.complex.select_atoms(f"id {atom_index}") + for atom in complex_id: + atom_name = atom.name + for lig_atom in self.lig_noh: + if lig_atom.name == atom_name: + lig_real_index = lig_atom.id + highlighted_pication.append(lig_real_index - 1) + + elif interaction_type == "metal": + ligidx = parts[1] + atom_index = int(ligidx) + complex_id = self.complex.select_atoms(f"id {atom_index}") + for atom in complex_id: + atom_name = atom.name + for lig_atom in self.lig_noh: + if lig_atom.name == atom_name: + lig_real_index = lig_atom.id + highlighted_metal.append(lig_real_index - 1) + for value in highlighted_hbond_donor[:]: if value in highlighted_hbond_acceptor: highlighted_hbond_donor.remove(value) @@ -337,15 +363,9 @@ def update_dict( """ for source_dict in source_dicts: for key, value in source_dict.items(): - if key in target_dict: - # Combine the RGB values (optional approach; adjust as needed) - existing_value = target_dict[key] - target_dict[key] = tuple( - min(1.0, max(0.0, (existing_value[i] + value[i]) / 2)) - for i in range(3) - ) - else: - target_dict[key] = value + int_key = int(key) + if int_key not in target_dict: + target_dict[int_key] = value class ImageMerger: @@ -385,7 +405,7 @@ def create_and_merge_images(self) -> List[str]: label = data.split()[-1] type_ = data.split()[ -2 - ] # Renamed 'type' to 'type_' to avoid conflict with the built-in type() function + ] if label == "hydrophobic": (line,) = ax.plot( x, y, label=data, color=(1.0, 1.0, 0.0), linewidth=5.0