From 39e9cce778708411e86e3f8362b71a518b308ab0 Mon Sep 17 00:00:00 2001
From: ARMI <armenianapoli@gmail.com>
Date: Mon, 19 Aug 2024 13:02:55 +0200
Subject: [PATCH 01/23] Update visualization_functions.py

added type checking and hinting
---
 .../visualization_functions.py                | 58 +++++++++++--------
 1 file changed, 33 insertions(+), 25 deletions(-)

diff --git a/openmmdl/openmmdl_analysis/visualization_functions.py b/openmmdl/openmmdl_analysis/visualization_functions.py
index f7981118..4b4f8441 100644
--- a/openmmdl/openmmdl_analysis/visualization_functions.py
+++ b/openmmdl/openmmdl_analysis/visualization_functions.py
@@ -6,30 +6,31 @@
 import subprocess
 import os
 import shutil
+from typing import List, Dict, Any, Optional, Union
 from openmmdl.openmmdl_analysis.barcode_generation import BarcodeGenerator
 
 
 class TrajectorySaver:
-    def __init__(self, pdb_md, ligname, special, nucleic):
-        """Initializes the TrajectorySaver with a mda.Universe object, ligand name, special residue name and if receptor is nucleic.
+    def __init__(self, pdb_md: mda.Universe, ligname: str, special: str, nucleic: bool) -> None:
+        """Initializes the TrajectorySaver with an mda.Universe object, ligand name, special residue name, and receptor type.
 
         Args:
-            pdb_md (mda.Universe): MDAnalysis Universe object containing the trajectory
-            ligname (str): name of the ligand in the pdb file
-            special (str): name of the special residue/ligand in the pdb file (e.g. HEM)
-            nucleic (bool): True if receptor is nucleic, False otherwise
+            pdb_md (mda.Universe): MDAnalysis Universe object containing the trajectory.
+            ligname (str): Name of the ligand in the pdb file.
+            special (str): Name of the special residue/ligand in the pdb file (e.g., HEM).
+            nucleic (bool): True if the receptor is nucleic, False otherwise.
         """
         self.pdb_md = pdb_md
         self.ligname = ligname
         self.special = special
         self.nucleic = nucleic
 
-    def save_interacting_waters_trajectory(self, interacting_waters, outputpath):
-        """Saves .pdb and .dcd files of the trajectory containing ligand, receptor and all interacting waters.
+    def save_interacting_waters_trajectory(self, interacting_waters: List[int], outputpath: str = './Visualization/') -> None:
+        """Saves .pdb and .dcd files of the trajectory containing ligand, receptor, and all interacting waters.
 
         Args:
-            interacting_waters (list): list of all interacting water ids
-            outputpath (str, optional): filepath to output new pdb and dcd files. Defaults to './Visualization/'.
+            interacting_waters (List[int]): List of all interacting water IDs.
+            outputpath (str, optional): Filepath to output new pdb and dcd files. Defaults to './Visualization/'.
         """
         water_atoms = self.pdb_md.select_atoms(
             f"protein or nucleic or resname {self.ligname} or resname {self.special}"
@@ -47,13 +48,13 @@ def save_interacting_waters_trajectory(self, interacting_waters, outputpath):
             for ts in self.pdb_md.trajectory:
                 W.write(water_atoms)
 
-    def save_frame(self, frame, outpath, selection=False):
+    def save_frame(self, frame: int, outpath: str, selection: Optional[str] = None) -> None:
         """Saves a single frame of the trajectory.
 
         Args:
-            frame (int): Number of the frame to save
-            outpath (str): Path to save the frame to
-            selection (str, optional): MDAnalysis selection string. Defaults to False.
+            frame (int): Number of the frame to save.
+            outpath (str): Path to save the frame to.
+            selection (Optional[str], optional): MDAnalysis selection string. Defaults to None.
         """
         self.pdb_md.trajectory[frame]
         if selection:
@@ -64,30 +65,37 @@ def save_frame(self, frame, outpath, selection=False):
 
 
 class Visualizer:
-    def __init__(self, md, cloud_path, ligname, special):
+    def __init__(self, md: mda.Universe, cloud_path: str, ligname: str, special: Optional[str] = None) -> None:
         self.md = md
         self.cloud = self.load_cloud(cloud_path)
         self.ligname = ligname
         self.special = special
 
-    def load_cloud(self, cloud_path):
+    def load_cloud(self, cloud_path: str) -> Dict[str, Dict[str, Union[List[float], List[int]]]]:
+        """Loads interaction cloud data from a JSON file.
+
+        Args:
+            cloud_path (str): Path to the cloud data JSON file.
+
+        Returns:
+            Dict[str, Dict[str, Union[List[float], List[int]]]]: The loaded cloud data.
+        """
         with open(cloud_path, "r") as f:
             data = json.load(f)
         return data
 
     def visualize(
-        self, receptor_type="protein or nucleic", height="1000px", width="1000px"
-    ):
+        self, receptor_type: str = "protein or nucleic", height: str = "1000px", width: str = "1000px"
+    ) -> nv.NGLWidget:
         """Generates visualization of the trajectory with the interacting waters and interaction clouds.
 
         Args:
-            ligname (str): name of the ligand in the pdb file
-            receptor_type (str, optional): type of receptor. Defaults to 'protein or nucleic'.
-            height (str, optional): height of the visualization. Defaults to '1000px'.
-            width (str, optional): width of the visualization. Defaults to '1000px'.
+            receptor_type (str, optional): Type of receptor. Defaults to 'protein or nucleic'.
+            height (str, optional): Height of the visualization. Defaults to '1000px'.
+            width (str, optional): Width of the visualization. Defaults to '1000px'.
 
         Returns:
-            nglview widget: returns an nglview.widget object containing the visualization
+            nv.NGLWidget: Returns an nglview.widget object containing the visualization.
         """
 
         sphere_buffers = []
@@ -141,8 +149,8 @@ def visualize(
         return view
 
 
-def run_visualization():
-    """Runs the visualization notebook in the current directory. The visualization notebook is copied from the package directory to the current directory and automaticaly started."""
+def run_visualization() -> None:
+    """Runs the visualization notebook in the current directory. The visualization notebook is copied from the package directory to the current directory and automatically started."""
     package_dir = os.path.dirname(__file__)
     notebook_path = os.path.join(package_dir, "visualization.ipynb")
     current_dir = os.getcwd()

From 4b8e11ac5dfe1596c87a725ebb43df0226147a5a Mon Sep 17 00:00:00 2001
From: ARMI <armenianapoli@gmail.com>
Date: Mon, 19 Aug 2024 13:27:07 +0200
Subject: [PATCH 02/23] Update rmsd_calculation.py

added type checking and hinting
---
 .../openmmdl_analysis/rmsd_calculation.py     | 73 +++++++++----------
 1 file changed, 35 insertions(+), 38 deletions(-)

diff --git a/openmmdl/openmmdl_analysis/rmsd_calculation.py b/openmmdl/openmmdl_analysis/rmsd_calculation.py
index 60a650da..cbcc4505 100644
--- a/openmmdl/openmmdl_analysis/rmsd_calculation.py
+++ b/openmmdl/openmmdl_analysis/rmsd_calculation.py
@@ -7,10 +7,11 @@
 import MDAnalysis as mda
 from MDAnalysis.analysis import rms, diffusionmap
 from MDAnalysis.analysis.distances import dist
+from typing import Optional, List, Tuple, Union
 
 
 @jit(nopython=True, parallel=True, nogil=True)
-def calc_rmsd_2frames_jit(ref, frame):
+def calc_rmsd_2frames_jit(ref: np.ndarray, frame: np.ndarray) -> float:
     dist = np.zeros(len(frame))
     for atom in range(len(frame)):
         dist[atom] = (
@@ -23,12 +24,14 @@ def calc_rmsd_2frames_jit(ref, frame):
 
 
 class RMSDAnalyzer:
-    def __init__(self, prot_lig_top_file, prot_lig_traj_file):
-        self.prot_lig_top_file = prot_lig_top_file
-        self.prot_lig_traj_file = prot_lig_traj_file
-        self.universe = mda.Universe(prot_lig_top_file, prot_lig_traj_file)
-
-    def rmsd_for_atomgroups(self, fig_type, selection1, selection2=None):
+    def __init__(self, prot_lig_top_file: str, prot_lig_traj_file: str) -> None:
+        self.prot_lig_top_file: str = prot_lig_top_file
+        self.prot_lig_traj_file: str = prot_lig_traj_file
+        self.universe: mda.Universe = mda.Universe(prot_lig_top_file, prot_lig_traj_file)
+
+    def rmsd_for_atomgroups(
+        self, fig_type: str, selection1: str, selection2: Optional[List[str]] = None
+    ) -> pd.DataFrame:
         """Calculate the RMSD for selected atom groups, and save the csv file and plot.
 
         Args:
@@ -45,7 +48,7 @@ def rmsd_for_atomgroups(self, fig_type, selection1, selection2=None):
             self.universe, ref, select=selection1, groupselections=selection2
         )
         rmsd_analysis.run()
-        columns = [selection1, *selection2] if selection2 else [selection1]
+        columns = [selection1] + selection2 if selection2 else [selection1]
         rmsd_df = pd.DataFrame(np.round(rmsd_analysis.rmsd[:, 2:], 2), columns=columns)
         rmsd_df.index.name = "frame"
 
@@ -54,16 +57,18 @@ def rmsd_for_atomgroups(self, fig_type, selection1, selection2=None):
         os.makedirs(output_directory, exist_ok=True)
 
         # Save the RMSD values to a CSV file in the created directory
-        rmsd_df.to_csv("./RMSD/RMSD_over_time.csv", sep=" ")
+        rmsd_df.to_csv(f"{output_directory}/RMSD_over_time.csv", sep=" ")
 
         # Plot and save the RMSD over time as a PNG file
         rmsd_df.plot(title="RMSD of protein and ligand")
         plt.ylabel("RMSD (Å)")
-        plt.savefig(f"./RMSD/RMSD_over_time.{fig_type}")
+        plt.savefig(f"{output_directory}/RMSD_over_time.{fig_type}")
 
         return rmsd_df
 
-    def rmsd_dist_frames(self, fig_type, lig, nucleic=False):
+    def rmsd_dist_frames(
+        self, fig_type: str, lig: str, nucleic: bool = False
+    ) -> Tuple[np.ndarray, np.ndarray]:
         """Calculate the RMSD between all frames in a matrix.
 
         Args:
@@ -76,24 +81,24 @@ def rmsd_dist_frames(self, fig_type, lig, nucleic=False):
             np.array: pairwise_rmsd_lig. Numpy array of RMSD values for ligand structures.
         """
         if nucleic:
-            pairwise_rmsd_prot = (
+            pairwise_rmsd_prot: np.ndarray = (
                 diffusionmap.DistanceMatrix(self.universe, select="nucleic")
                 .run()
                 .dist_matrix
             )
         else:
-            pairwise_rmsd_prot = (
+            pairwise_rmsd_prot: np.ndarray = (
                 diffusionmap.DistanceMatrix(self.universe, select="protein")
                 .run()
                 .dist_matrix
             )
-        pairwise_rmsd_lig = (
+        pairwise_rmsd_lig: np.ndarray = (
             diffusionmap.DistanceMatrix(self.universe, f"resname {lig}")
             .run()
             .dist_matrix
         )
 
-        max_dist = max(np.amax(pairwise_rmsd_lig), np.amax(pairwise_rmsd_prot))
+        max_dist: float = max(np.amax(pairwise_rmsd_lig), np.amax(pairwise_rmsd_prot))
 
         fig, ax = plt.subplots(1, 2)
         fig.suptitle("RMSD between the frames")
@@ -119,14 +124,14 @@ def rmsd_dist_frames(self, fig_type, lig, nucleic=False):
         plt.savefig(f"./RMSD/RMSD_between_the_frames.{fig_type}")
         return pairwise_rmsd_prot, pairwise_rmsd_lig
 
-    def calc_rmsd_2frames(self, ref, frame):
+    def calc_rmsd_2frames(self, ref: np.ndarray, frame: np.ndarray) -> float:
         """
         RMSD calculation between a reference and a frame.
         """
         return calc_rmsd_2frames_jit(ref, frame)
 
-    def calculate_distance_matrix(self, selection):
-        distances = np.zeros(
+    def calculate_distance_matrix(self, selection: str) -> np.ndarray:
+        distances: np.ndarray = np.zeros(
             (len(self.universe.trajectory), len(self.universe.trajectory))
         )
         # calculate distance matrix
@@ -135,44 +140,36 @@ def calculate_distance_matrix(self, selection):
             desc="\033[1mCalculating distance matrix:\033[0m",
         ):
             self.universe.trajectory[i]
-            frame_i = self.universe.select_atoms(selection).positions
-            # distances[i] = md.rmsd(traj_aligned, traj_aligned, frame=i)
+            frame_i: np.ndarray = self.universe.select_atoms(selection).positions
             for j in range(i + 1, len(self.universe.trajectory)):
                 self.universe.trajectory[j]
-                frame_j = self.universe.select_atoms(selection).positions
-                rmsd = self.calc_rmsd_2frames(frame_i, frame_j)
+                frame_j: np.ndarray = self.universe.select_atoms(selection).positions
+                rmsd: float = self.calc_rmsd_2frames(frame_i, frame_j)
                 distances[i][j] = rmsd
                 distances[j][i] = rmsd
         return distances
 
-    def calculate_representative_frame(self, bmode_frames, DM):
-        """Calculates the most representative frame for a bindingmode. This is based uppon the averagwe RMSD of a frame to all other frames in the binding mode.
+    def calculate_representative_frame(
+        self, bmode_frames: List[int], DM: np.ndarray
+    ) -> int:
+        """Calculates the most representative frame for a binding mode. This is based upon the average RMSD of a frame to all other frames in the binding mode.
 
         Args:
-            bmode_frame_list (list): List of frames belonging to a binding mode.
+            bmode_frames (list): List of frames belonging to a binding mode.
             DM (np.array): Distance matrix of trajectory.
 
         Returns:
             int: Number of the most representative frame.
         """
-        frames = bmode_frames
-        mean_rmsd_per_frame = {}
-        # first loop  : first frame
+        frames: List[int] = bmode_frames
+        mean_rmsd_per_frame: dict = {}
         for frame_i in frames:
             mean_rmsd_per_frame[frame_i] = 0
-            # we will add the rmsd between theses 2 frames and then calcul the
-            # mean
             for frame_j in frames:
-                # We don't want to calcul the same frame.
-                if not frame_j == frame_i:
-                    # we add to the corresponding value in the list of all rmsd
-                    # the RMSD betwween frame_i and frame_j
+                if frame_j != frame_i:
                     mean_rmsd_per_frame[frame_i] += DM[frame_i - 1, frame_j - 1]
-            # mean calculation
             mean_rmsd_per_frame[frame_i] /= len(frames)
 
-            # Representative frame = frame with lower RMSD between all other
-            # frame of the cluster
-            repre = min(mean_rmsd_per_frame, key=mean_rmsd_per_frame.get)
+        repre: int = min(mean_rmsd_per_frame, key=mean_rmsd_per_frame.get)
 
         return repre

From 4193611a02a73bd47bfb92edb8b2ded4eff88e69 Mon Sep 17 00:00:00 2001
From: ARMI <armenianapoli@gmail.com>
Date: Mon, 19 Aug 2024 13:55:28 +0200
Subject: [PATCH 03/23] Update rdkit_figure_generation.py

added type checking and type hinting
---
 .../rdkit_figure_generation.py                | 337 +++++++++---------
 1 file changed, 162 insertions(+), 175 deletions(-)

diff --git a/openmmdl/openmmdl_analysis/rdkit_figure_generation.py b/openmmdl/openmmdl_analysis/rdkit_figure_generation.py
index e435f9a2..d630f30a 100644
--- a/openmmdl/openmmdl_analysis/rdkit_figure_generation.py
+++ b/openmmdl/openmmdl_analysis/rdkit_figure_generation.py
@@ -12,13 +12,13 @@
 class LigandImageGenerator:
     def __init__(
         self,
-        ligand_name,
-        complex_pdb_file,
-        ligand_no_h_pdb_file,
-        smiles_file,
-        output_svg_filename,
-        fig_type="svg",
-    ):
+        ligand_name: str,
+        complex_pdb_file: str,
+        ligand_no_h_pdb_file: str,
+        smiles_file: str,
+        output_svg_filename: str,
+        fig_type: str = "svg",
+    ) -> None:
         """
         Initialize the LigandImageGenerator class.
 
@@ -30,14 +30,14 @@ def __init__(
             output_svg_filename (str): Name of the output SVG file.
             fig_type (str): Type of the output figure. Can be "svg" or "png".
         """
-        self.ligand_name = ligand_name
-        self.complex_pdb_file = complex_pdb_file
-        self.ligand_no_h_pdb_file = ligand_no_h_pdb_file
-        self.smiles_file = smiles_file
-        self.output_svg_filename = output_svg_filename
-        self.fig_type = fig_type
-
-    def generate_image(self):
+        self.ligand_name: str = ligand_name
+        self.complex_pdb_file: str = complex_pdb_file
+        self.ligand_no_h_pdb_file: str = ligand_no_h_pdb_file
+        self.smiles_file: str = smiles_file
+        self.output_svg_filename: str = output_svg_filename
+        self.fig_type: str = fig_type
+
+    def generate_image(self) -> None:
         """Generates an SVG image (or PNG) of the ligand."""
         try:
             # Load complex and ligand structures
@@ -47,31 +47,31 @@ def generate_image(self):
             complex_lig = complex.select_atoms(f"resname {self.ligand_name}")
 
             # Load ligand from PDB file
-            mol = Chem.MolFromPDBFile(self.ligand_no_h_pdb_file)
-            lig_rd = mol
+            mol: Chem.Mol = Chem.MolFromPDBFile(self.ligand_no_h_pdb_file)
+            lig_rd: Chem.Mol = mol
 
             # Load reference SMILES
             with open(self.smiles_file, "r") as file:
-                reference_smiles = file.read().strip()
-            reference_mol = Chem.MolFromSmiles(reference_smiles)
+                reference_smiles: str = file.read().strip()
+            reference_mol: Chem.Mol = Chem.MolFromSmiles(reference_smiles)
 
             # Prepare ligand
-            prepared_ligand = AllChem.AssignBondOrdersFromTemplate(
+            prepared_ligand: Chem.Mol = AllChem.AssignBondOrdersFromTemplate(
                 reference_mol, lig_rd
             )
             AllChem.Compute2DCoords(prepared_ligand)
 
             # Map atom indices between ligand_no_h and complex
             for atom in prepared_ligand.GetAtoms():
-                atom_index = atom.GetIdx()
+                atom_index: int = atom.GetIdx()
                 for lig_atom in lig_noh:
-                    lig_index = lig_atom.index
+                    lig_index: int = lig_atom.index
                     if atom_index == lig_index:
-                        lig_atom_name = lig_atom.name
+                        lig_atom_name: str = lig_atom.name
                         for comp_lig in complex_lig:
-                            comp_lig_name = comp_lig.name
+                            comp_lig_name: str = comp_lig.name
                             if lig_atom_name == comp_lig_name:
-                                num = int(comp_lig.id)
+                                num: int = int(comp_lig.id)
                                 atom.SetAtomMapNum(num)
 
             # Generate an SVG image of the ligand
@@ -82,13 +82,13 @@ def generate_image(self):
             drawer.DrawMolecule(prepared_ligand)
 
             # Adjust font size in the SVG output using the FontSize method
-            font_size = drawer.FontSize()
+            font_size: float = drawer.FontSize()
             drawer.SetFontSize(
                 font_size * 0.5
             )  # You can adjust the multiplier as needed
 
             drawer.FinishDrawing()
-            svg = drawer.GetDrawingText().replace("svg:", "")
+            svg: str = drawer.GetDrawingText().replace("svg:", "")
 
             # Save the SVG image to the specified output file
             with open(self.output_svg_filename, "w") as f:
@@ -96,7 +96,7 @@ def generate_image(self):
 
             # Convert to PNG if requested
             if self.fig_type == "png":
-                png_filename = self.output_svg_filename.replace(".svg", ".png")
+                png_filename: str = self.output_svg_filename.replace(".svg", ".png")
                 cairosvg.svg2png(url=self.output_svg_filename, write_to=png_filename)
                 print(f"PNG image saved as: {png_filename}")
 
@@ -104,8 +104,11 @@ def generate_image(self):
             print(f"Error: {e}")
 
 
+import MDAnalysis as mda
+from typing import List, Dict, Tuple
+
 class InteractionProcessor:
-    def __init__(self, complex_pdb_file, ligand_no_h_pdb_file):
+    def __init__(self, complex_pdb_file: str, ligand_no_h_pdb_file: str):
         """
         Initialize the InteractionProcessor class.
 
@@ -119,14 +122,14 @@ def __init__(self, complex_pdb_file, ligand_no_h_pdb_file):
         self.ligand_no_h = mda.Universe(ligand_no_h_pdb_file)
         self.lig_noh = self.ligand_no_h.select_atoms("all")
 
-    def split_interaction_data(self, data):
+    def split_interaction_data(self, data: List[str]) -> List[str]:
         """Splits the input data into multiple parts.
 
         Args:
-            data (list): A list of ResNr and ResType, Atom indices, interaction type that needs to be split.
+            data (List[str]): A list of ResNr and ResType, Atom indices, interaction type that needs to be split.
 
         Returns:
-            list: A new list of the interaction data that consists of three parts.
+            List[str]: A new list of the interaction data that consists of three parts.
         """
         split_data = []
         for item in data:
@@ -138,28 +141,31 @@ def split_interaction_data(self, data):
             split_data.append(split_value)
         return split_data
 
-    def highlight_numbers(self, split_data, starting_idx):
+    def highlight_numbers(self, split_data: List[str], starting_idx: List[int]) -> Tuple[
+        List[int], List[int], List[int], List[int], List[int], List[int], List[int], List[int], List[int], List[int], List[int]
+    ]:
         """Extracts the data from the split_data output of the interactions and categorizes it to its respective list.
 
         Args:
-            split_data (list): A list of interaction data items, where each item contains information about protein partner name,
+            split_data (List[str]): A list of interaction data items, where each item contains information about protein partner name,
             numeric codes and interaction type.
-            starting_idx (list): Starting index of the ligand atom indices used for identifying the correct atom to highlight.
+            starting_idx (List[int]): Starting index of the ligand atom indices used for identifying the correct atom to highlight.
 
         Returns:
-            tuple: A tuple that contains list of all of the highlighted atoms of all of the interactions.
+            Tuple[List[int], List[int], List[int], List[int], List[int], List[int], List[int], List[int], List[int], List[int], List[int]]:
+            A tuple that contains list of all of the highlighted atoms of all of the interactions.
         """
-        highlighted_hbond_acceptor = []
-        highlighted_hbond_donor = []
-        highlighted_hydrophobic = []
-        highlighted_hbond_both = []
-        highlighted_waterbridge = []
-        highlighted_pistacking = []
-        highlighted_halogen = []
-        highlighted_ni = []
-        highlighted_pi = []
-        highlighted_pication = []
-        highlighted_metal = []
+        highlighted_hbond_acceptor: List[int] = []
+        highlighted_hbond_donor: List[int] = []
+        highlighted_hydrophobic: List[int] = []
+        highlighted_hbond_both: List[int] = []
+        highlighted_waterbridge: List[int] = []
+        highlighted_pistacking: List[int] = []
+        highlighted_halogen: List[int] = []
+        highlighted_ni: List[int] = []
+        highlighted_pi: List[int] = []
+        highlighted_pication: List[int] = []
+        highlighted_metal: List[int] = []
 
         for item in split_data:
             parts = item.split()
@@ -167,119 +173,83 @@ def highlight_numbers(self, split_data, starting_idx):
             numeric_codes = parts[1:-1]
             interaction_type = parts[-1]
 
-            if interaction_type == "hbond":
-                parts = item.split()
-                protein_partner_name = parts[0]
-                numeric_codes = parts[1:-2]
-                type = parts[-2]
-                interaction_type = parts[-1]
-                for code in numeric_codes:
-                    atom_index = int(code)
-                    complex_id = self.complex.select_atoms(f"id {atom_index}")
-                    for atom in complex_id:
-                        atom_name = atom.name
-                    for lig_atom in self.lig_noh:
-                        if lig_atom.name == atom_name:
-                            lig_real_index = lig_atom.id
+            for code in numeric_codes:
+                atom_index = int(code)
+                complex_id = self.complex.select_atoms(f"id {atom_index}")
+                atom_name = complex_id[0].name if len(complex_id) > 0 else None
+
+                if atom_name is None:
+                    continue
+
+                lig_atom = self.lig_noh.select_atoms(f"name {atom_name}")
+                lig_real_index = lig_atom[0].id - 1 if len(lig_atom) > 0 else None
+
+                if lig_real_index is None:
+                    continue
+
+                if interaction_type == "hbond":
+                    type = parts[-2]
                     if type == "Donor":
-                        highlighted_hbond_donor.append(lig_real_index - 1)
+                        highlighted_hbond_donor.append(lig_real_index)
                     elif type == "Acceptor":
-                        highlighted_hbond_acceptor.append(lig_real_index - 1)
-
-            elif interaction_type == "hydrophobic":
-                for code in numeric_codes:
-                    atom_index = int(code)
-                    complex_id = self.complex.select_atoms(f"id {atom_index}")
-                    for atom in complex_id:
-                        atom_name = atom.name
-                    for lig_atom in self.lig_noh:
-                        if lig_atom.name == atom_name:
-                            lig_real_index = lig_atom.id
-                    highlighted_hydrophobic.append(lig_real_index - 1)
-
-            elif interaction_type == "waterbridge":
-                for code in numeric_codes:
-                    atom_index = int(code)
-                    complex_id = self.complex.select_atoms(f"id {atom_index}")
-                    for atom in complex_id:
-                        atom_name = atom.name
-                    for lig_atom in self.lig_noh:
-                        if lig_atom.name == atom_name:
-                            lig_real_index = lig_atom.id
-                    highlighted_waterbridge.append(lig_real_index - 1)
-
-            elif interaction_type == "pistacking":
-                split_codes = numeric_codes[0].split(",")
-                for code in split_codes:
-                    atom_index = int(code)
-                    complex_id = self.complex.select_atoms(f"id {atom_index}")
-                    for atom in complex_id:
-                        atom_name = atom.name
-                    for lig_atom in self.lig_noh:
-                        if lig_atom.name == atom_name:
-                            lig_real_index = lig_atom.id
-                    highlighted_pistacking.append(lig_real_index - 1)
-
-            elif interaction_type == "halogen":
-                numeric_codes = parts[1:-2]
-                for code in numeric_codes:
-                    atom_index = int(code)
-                    complex_id = self.complex.select_atoms(f"id {atom_index}")
-                    for atom in complex_id:
-                        atom_name = atom.name
-                    for lig_atom in self.lig_noh:
-                        if lig_atom.name == atom_name:
-                            lig_real_index = lig_atom.id
-                    highlighted_halogen.append(lig_real_index - 1)
-
-            elif interaction_type == "saltbridge":
-                numeric_codes = parts[1:-3]
-                saltbridge_type = parts[-2]
-                if saltbridge_type == "NI":
+                        highlighted_hbond_acceptor.append(lig_real_index)
+
+                elif interaction_type == "hydrophobic":
+                    highlighted_hydrophobic.append(lig_real_index)
+
+                elif interaction_type == "waterbridge":
+                    highlighted_waterbridge.append(lig_real_index)
+
+                elif interaction_type == "pistacking":
                     split_codes = numeric_codes[0].split(",")
                     for code in split_codes:
                         atom_index = int(code)
                         complex_id = self.complex.select_atoms(f"id {atom_index}")
-                        for atom in complex_id:
-                            atom_name = atom.name
-                        for lig_atom in self.lig_noh:
-                            if lig_atom.name == atom_name:
-                                lig_real_index = lig_atom.id
-                        highlighted_ni.append(lig_real_index - 1)
-                elif saltbridge_type == "PI":
-                    for code in numeric_codes:
-                        atom_index = int(code)
-                        complex_id = self.complex.select_atoms(f"id {atom_index}")
-                        for atom in complex_id:
-                            atom_name = atom.name
-                        for lig_atom in self.lig_noh:
-                            if lig_atom.name == atom_name:
-                                lig_real_index = lig_atom.id
-                        highlighted_pi.append(lig_real_index - 1)
-
-            elif interaction_type == "pication":
-                numeric_codes = parts[1:-2]
-                for code in numeric_codes:
-                    atom_index = int(code)
-                    complex_id = self.complex.select_atoms(f"id {atom_index}")
-                    for atom in complex_id:
-                        atom_name = atom.name
-                    for lig_atom in self.lig_noh:
-                        if lig_atom.name == atom_name:
-                            lig_real_index = lig_atom.id
-                    highlighted_pication.append(lig_real_index - 1)
-
-            elif interaction_type == "metal":
-                ligidx = parts[1]
-                atom_index = int(ligidx)
-                complex_id = self.complex.select_atoms(f"id {atom_index}")
-                for atom in complex_id:
-                    atom_name = atom.name
-                for lig_atom in self.lig_noh:
-                    if lig_atom.name == atom_name:
-                        lig_real_index = lig_atom.id
-                highlighted_metal.append(lig_real_index - 1)
-
+                        atom_name = complex_id[0].name if len(complex_id) > 0 else None
+                        if atom_name is None:
+                            continue
+                        lig_atom = self.lig_noh.select_atoms(f"name {atom_name}")
+                        lig_real_index = lig_atom[0].id - 1 if len(lig_atom) > 0 else None
+                        if lig_real_index is not None:
+                            highlighted_pistacking.append(lig_real_index)
+
+                elif interaction_type == "halogen":
+                    highlighted_halogen.append(lig_real_index)
+
+                elif interaction_type == "saltbridge":
+                    saltbridge_type = parts[-2]
+                    if saltbridge_type == "NI":
+                        split_codes = numeric_codes[0].split(",")
+                        for code in split_codes:
+                            atom_index = int(code)
+                            complex_id = self.complex.select_atoms(f"id {atom_index}")
+                            atom_name = complex_id[0].name if len(complex_id) > 0 else None
+                            if atom_name is None:
+                                continue
+                            lig_atom = self.lig_noh.select_atoms(f"name {atom_name}")
+                            lig_real_index = lig_atom[0].id - 1 if len(lig_atom) > 0 else None
+                            if lig_real_index is not None:
+                                highlighted_ni.append(lig_real_index)
+
+                    elif saltbridge_type == "PI":
+                        for code in numeric_codes:
+                            atom_index = int(code)
+                            complex_id = self.complex.select_atoms(f"id {atom_index}")
+                            atom_name = complex_id[0].name if len(complex_id) > 0 else None
+                            if atom_name is None:
+                                continue
+                            lig_atom = self.lig_noh.select_atoms(f"name {atom_name}")
+                            lig_real_index = lig_atom[0].id - 1 if len(lig_atom) > 0 else None
+                            if lig_real_index is not None:
+                                highlighted_pi.append(lig_real_index)
+
+                elif interaction_type == "pication":
+                    highlighted_pication.append(lig_real_index)
+
+                elif interaction_type == "metal":
+                    highlighted_metal.append(lig_real_index)
+
+        # Identify common indices between hbond_donor and hbond_acceptor
         for value in highlighted_hbond_donor[:]:
             if value in highlighted_hbond_acceptor:
                 highlighted_hbond_donor.remove(value)
@@ -300,15 +270,15 @@ def highlight_numbers(self, split_data, starting_idx):
             highlighted_metal,
         )
 
-    def generate_interaction_dict(self, interaction_type, keys):
+    def generate_interaction_dict(self, interaction_type: str, keys: List[int]) -> Dict[int, Tuple[float, float, float]]:
         """Generates a dictionary of interaction RGB color model based on the provided interaction type.
 
         Args:
             interaction_type (str): The type of the interaction, for example 'hydrophobic'.
-            keys (list): List of the highlighted atoms that display an interaction.
+            keys (List[int]): List of the highlighted atoms that display an interaction.
 
         Returns:
-            dict: A dictionary with the interaction types are associated with their respective RGB color codes.
+            Dict[int, Tuple[float, float, float]]: A dictionary with the interaction types associated with their respective RGB color codes.
         """
         interaction_dict = {
             "hbond_acceptor": (1.0, 0.6, 0.6),
@@ -324,35 +294,52 @@ def generate_interaction_dict(self, interaction_type, keys):
             "metal": (1.0, 0.6, 0.0),
         }
 
-        interaction_dict = {
+        if interaction_type not in interaction_dict:
+            raise ValueError(f"Unknown interaction type: {interaction_type}")
+
+        return {
             int(key): interaction_dict[interaction_type] for key in keys
         }
-        return interaction_dict
 
-    def update_dict(self, target_dict, *source_dicts):
+    def update_dict(self, target_dict: Dict[int, Tuple[float, float, float]], *source_dicts: Dict[int, Tuple[float, float, float]]):
         """Updates the dictionary with the keys and values from other dictionaries.
 
         Args:
-            target_dict (dict): The dictionary that needs to be updated with new keys and values.
-            source_dicts (dict): One or multiple dictionaries that are used to update the target dictionary with new keys and values.
+            target_dict (Dict[int, Tuple[float, float, float]]): The dictionary that needs to be updated with new keys and values.
+            source_dicts (Dict[int, Tuple[float, float, float]]): One or more dictionaries with keys and values to be merged into the target dictionary.
         """
         for source_dict in source_dicts:
             for key, value in source_dict.items():
-                int_key = int(key)
-                if int_key not in target_dict:
-                    target_dict[int_key] = value
+                if key in target_dict:
+                    # Combine the RGB values (optional approach; adjust as needed)
+                    existing_value = target_dict[key]
+                    target_dict[key] = tuple(
+                        min(1.0, max(0.0, (existing_value[i] + value[i]) / 2))
+                        for i in range(3)
+                    )
+                else:
+                    target_dict[key] = value
+
 
+from typing import List, Optional
+from PIL import Image
+import pylab
+import os
 
 class ImageMerger:
     def __init__(
-        self, binding_mode, occurrence_percent, split_data, merged_image_paths
-    ):
+        self, 
+        binding_mode: str, 
+        occurrence_percent: float, 
+        split_data: List[str], 
+        merged_image_paths: List[str]
+    ) -> None:
         self.binding_mode = binding_mode
         self.occurrence_percent = occurrence_percent
         self.split_data = split_data
         self.merged_image_paths = merged_image_paths
 
-    def create_and_merge_images(self):
+    def create_and_merge_images(self) -> List[str]:
         """Create and merge images to generate a legend for binding modes.
 
         Returns:
@@ -374,17 +361,17 @@ def create_and_merge_images(self):
         for i, data in enumerate(filtered_split_data):
             y = data_points[i]
             label = data.split()[-1]
-            type = data.split()[-2]
+            type_ = data.split()[-2]  # Renamed 'type' to 'type_' to avoid conflict with the built-in type() function
             if label == "hydrophobic":
                 (line,) = ax.plot(
                     x, y, label=data, color=(1.0, 1.0, 0.0), linewidth=5.0
                 )
             elif label == "hbond":
-                if type == "Acceptor":
+                if type_ == "Acceptor":
                     (line,) = ax.plot(
                         x, y, label=data, color=(1.0, 0.6, 0.6), linewidth=5.0
                     )
-                elif type == "Donor":
+                elif type_ == "Donor":
                     (line,) = ax.plot(
                         x, y, label=data, color=(0.3, 0.5, 1.0), linewidth=5.0
                     )
@@ -409,11 +396,11 @@ def create_and_merge_images(self):
                     x, y, label=data, color=(1.0, 0.6, 0.0), linewidth=5.0
                 )
             elif label == "saltbridge":
-                if type == "NI":
+                if type_ == "NI":
                     (line,) = ax.plot(
                         x, y, label=data, color=(0.0, 0.0, 1.0), linewidth=5.0
                     )
-                elif type == "PI":
+                elif type_ == "PI":
                     (line,) = ax.plot(
                         x, y, label=data, color=(1.0, 0.0, 0.0), linewidth=5.0
                     )
@@ -476,11 +463,11 @@ def create_and_merge_images(self):
 
 
 class FigureArranger:
-    def __init__(self, merged_image_paths, output_path):
+    def __init__(self, merged_image_paths: List[str], output_path: str) -> None:
         self.merged_image_paths = merged_image_paths
         self.output_path = output_path
 
-    def arranged_figure_generation(self):
+    def arranged_figure_generation(self) -> None:
         """Generate an arranged figure by arranging merged images in rows and columns."""
         # Open the list of images
         merged_images = [Image.open(path) for path in self.merged_image_paths]

From 70a2142c2dddb15381f5916b153cb0c91767d12e Mon Sep 17 00:00:00 2001
From: ARMI <armenianapoli@gmail.com>
Date: Mon, 19 Aug 2024 14:00:28 +0200
Subject: [PATCH 04/23] Update preprocessing.py

added type checking and type hinting
---
 openmmdl/openmmdl_analysis/preprocessing.py | 172 ++++++--------------
 1 file changed, 53 insertions(+), 119 deletions(-)

diff --git a/openmmdl/openmmdl_analysis/preprocessing.py b/openmmdl/openmmdl_analysis/preprocessing.py
index 15928c92..0c7530b1 100644
--- a/openmmdl/openmmdl_analysis/preprocessing.py
+++ b/openmmdl/openmmdl_analysis/preprocessing.py
@@ -1,189 +1,139 @@
 import MDAnalysis as mda
-import subprocess
 import os
 import re
+from typing import List, Optional
 import rdkit
-import mdtraj as md
 from rdkit import Chem
-from rdkit.Chem import Draw
 from rdkit.Chem import AllChem
-from rdkit.Chem.Draw import rdMolDraw2D
+from rdkit.Chem import Mol
 from openbabel import pybel
-
+import mdtraj as md
 
 class Preprocessing:
     def __init__(self):
         pass
 
-    def renumber_protein_residues(self, input_pdb, reference_pdb, output_pdb):
-        """Renumber protein residues in a molecular dynamics trajectory based on a reference structure.
+    def renumber_protein_residues(
+        self, input_pdb: str, reference_pdb: str, output_pdb: str
+    ) -> None:
+        """
+        Renumber protein residues in a molecular dynamics trajectory based on a reference structure.
 
         Args:
             input_pdb (str): Path to the input PDB file representing the molecular dynamics trajectory to be renumbered.
             reference_pdb (str): Path to the reference PDB file representing the molecular dynamics trajectory used as a reference.
-            output_pdb (str): Path to the output PDB file where the renumbered trajectory will be saved..
+            output_pdb (str): Path to the output PDB file where the renumbered trajectory will be saved.
         """
-        # Load trajectories
         traj_input = md.load(input_pdb)
         traj_reference = md.load(reference_pdb)
 
-        # Get the topology DataFrames
         input_top_df, _ = traj_input.topology.to_dataframe()
         ref_top_df, _ = traj_reference.topology.to_dataframe()
 
-        # List of common protein residue names including additional residues
         protein_residues = [
-            "ALA",
-            "ARG",
-            "ASN",
-            "ASP",
-            "CYS",
-            "GLN",
-            "GLU",
-            "GLY",
-            "HIS",
-            "ILE",
-            "LEU",
-            "LYS",
-            "MET",
-            "PHE",
-            "PRO",
-            "SER",
-            "THR",
-            "TRP",
-            "TYR",
-            "VAL",
-            "ACE",
-            "NME",
-            "HIE",
-            "HID",
-            "HIP",
+            "ALA", "ARG", "ASN", "ASP", "CYS", "GLN", "GLU", "GLY", "HIS", "ILE",
+            "LEU", "LYS", "MET", "PHE", "PRO", "SER", "THR", "TRP", "TYR", "VAL",
+            "ACE", "NME", "HIE", "HID", "HIP"
         ]
 
-        # Iterate over each chain in the reference topology
         for chain_id in ref_top_df["chainID"].unique():
-            # Extract residue indices from the reference topology for the current chain
             ref_residue_indices = (
                 ref_top_df[
                     (ref_top_df["chainID"] == chain_id)
                     & ref_top_df["resName"].isin(protein_residues)
-                ]["resSeq"].values
-                - 1
+                ]["resSeq"].values - 1
             )
 
-            # Update residue indices in the input topology DataFrame for the current chain
-            mask = (input_top_df["chainID"] == chain_id) & input_top_df["resName"].isin(
-                protein_residues
-            )
+            mask = (input_top_df["chainID"] == chain_id) & input_top_df["resName"].isin(protein_residues)
             input_top_df.loc[mask, "resSeq"] = ref_residue_indices + 1
 
-        # Create a new topology from the modified DataFrame
         new_top = md.Topology.from_dataframe(input_top_df)
-
-        # Update the topology in the input trajectory
         new_traj = traj_input.slice(range(traj_input.n_frames))
         new_traj.topology = new_top
-
-        # Save the renumbered trajectory to a new PDB file
         new_traj.save(output_pdb)
 
-    def increase_ring_indices(self, ring, lig_index):
-        """Increases the atom indices in a ring of the ligand obtained from the ligand to fit the atom indices present in the protein-ligand complex.
+    def increase_ring_indices(self, ring: List[int], lig_index: int) -> List[int]:
+        """
+        Increases the atom indices in a ring of the ligand obtained from the ligand to fit the atom indices present in the protein-ligand complex.
 
         Args:
-            ring (str): A list of atom indices belonging to a ring that need to be modified.
-            lig_index (int): An integer that is the first number of the ligand atom indices obtained from the protein-ligand, which is used to modify the ring indices
+            ring (List[int]): A list of atom indices belonging to a ring that need to be modified.
+            lig_index (int): An integer that is the first number of the ligand atom indices obtained from the protein-ligand, which is used to modify the ring indices.
 
         Returns:
-            list:  A new list with modified atom indicies.
+            List[int]: A new list with modified atom indices.
         """
         return [atom_idx + lig_index for atom_idx in ring]
 
-    def convert_ligand_to_smiles(self, input_sdf, output_smi):
-        """Converts ligand structures from an SDF file to SMILES :) format
+    def convert_ligand_to_smiles(self, input_sdf: str, output_smi: str) -> None:
+        """
+        Converts ligand structures from an SDF file to SMILES format.
 
         Args:
-            input_sdf (str): Path to the SDF file with the ligand that wll be converted.
-            output_smi (str): Path to the output SMILES files.
+            input_sdf (str): Path to the SDF file with the ligand that will be converted.
+            output_smi (str): Path to the output SMILES file.
         """
-        # Create a molecule supplier from an SDF file
         mol_supplier = Chem.SDMolSupplier(input_sdf)
-
-        # Open the output SMILES file for writing
         with open(output_smi, "w") as output_file:
-            # Iterate through molecules and convert each to SMILES
             for mol in mol_supplier:
-                if mol is not None:  # Make sure the molecule was successfully read
+                if mol is not None:
                     smiles = Chem.MolToSmiles(mol)
                     output_file.write(smiles + "\n")
                 else:
-                    print("nono")
+                    print("Molecule could not be read.")
 
-    def process_pdb_file(self, input_pdb_filename):
-        """Process a PDB file to make it compatible with the openmmdl_analysis package.
+    def process_pdb_file(self, input_pdb_filename: str) -> None:
+        """
+        Process a PDB file to make it compatible with the openmmdl_analysis package.
 
         Args:
-            input_pdb_filename (str): path to the input PDB file
+            input_pdb_filename (str): Path to the input PDB file.
         """
-        # Load the PDB file
         u = mda.Universe(input_pdb_filename)
-
-        # Iterate through the topology to modify residue names
         for atom in u.atoms:
             resname = atom.resname
-
-            # Check if the residue name is one of the specified water names
             if resname in ["SPC", "TIP3", "TIP4", "WAT", "T3P", "T4P", "T5P"]:
                 atom.residue.resname = "HOH"
             elif resname == "*":
                 atom.residue.resname = "UNK"
-
-        # Save the modified topology to a new PDB file
         u.atoms.write(input_pdb_filename)
 
     def extract_and_save_ligand_as_sdf(
-        self, input_pdb_filename, output_filename, target_resname
-    ):
-        """Extract and save the ligand from the receptor ligand complex PDB file into a new PDB file by itself.
+        self, input_pdb_filename: str, output_filename: str, target_resname: str
+    ) -> None:
+        """
+        Extract and save the ligand from the receptor-ligand complex PDB file into a new SDF file.
 
         Args:
-            input_pdb_filename (str): name of the input PDB file
-            output_pdb_filename (str): name of the output SDF file
-            target_resname (str): resname of the ligand in the original PDB file
+            input_pdb_filename (str): Name of the input PDB file.
+            output_filename (str): Name of the output SDF file.
+            target_resname (str): Residue name of the ligand in the original PDB file.
         """
-        # Load the PDB file using MDAnalysis
         u = mda.Universe(input_pdb_filename)
-
-        # Find the ligand by its residue name
         ligand_atoms = u.select_atoms(f"resname {target_resname}")
 
         if len(ligand_atoms) == 0:
-            print(
-                f"No ligand with residue name '{target_resname}' found in the PDB file."
-            )
+            print(f"No ligand with residue name '{target_resname}' found in the PDB file.")
             return
 
-        # Create a new Universe with only the ligand
         ligand_universe = mda.Merge(ligand_atoms)
-
-        # Save the ligand Universe as a PDB file
         ligand_universe.atoms.write("lig.pdb")
 
-        # use openbabel to convert pdb to sdf
         mol = next(pybel.readfile("pdb", "lig.pdb"))
         mol.write("sdf", output_filename, overwrite=True)
-        # remove the temporary pdb file
         os.remove("lig.pdb")
 
-    def renumber_atoms_in_residues(self, input_pdb_file, output_pdb_file, lig_name):
-        """Renumer the atoms of the ligand in the topology PDB file.
+    def renumber_atoms_in_residues(
+        self, input_pdb_file: str, output_pdb_file: str, lig_name: str
+    ) -> None:
+        """
+        Renumber the atoms of the ligand in the topology PDB file.
 
         Args:
             input_pdb_file (str): Path to the initial PDB file.
             output_pdb_file (str): Path to the output PDB file.
             lig_name (str): Name of the ligand in the input PDB file.
         """
-        # Read the input PDB file
         with open(input_pdb_file, "r") as f:
             pdb_lines = f.readlines()
 
@@ -192,40 +142,25 @@ def renumber_atoms_in_residues(self, input_pdb_file, output_pdb_file, lig_name):
 
         for line in pdb_lines:
             if line.startswith("ATOM"):
-                # Extract information from the ATOM line
                 atom_serial = int(line[6:11])
                 atom_name = line[12:16].strip()
                 residue_name = line[17:20].strip()
 
-                # Check if the residue is residue_name
                 if residue_name == lig_name:
-                    # Extract the element from the atom name (assuming it starts with a capital letter)
                     element_match = re.match(r"([A-Z]+)", atom_name)
-                    if element_match:
-                        element = element_match.group(1)
-                    else:
-                        element = atom_name
-
-                    # Increment the count for the current element in the current residue_name residue
-                    lig_residue_elements[element] = (
-                        lig_residue_elements.get(element, 0) + 1
-                    )
-
-                    # Update the atom name based on the element count
+                    element = element_match.group(1) if element_match else atom_name
+                    lig_residue_elements[element] = lig_residue_elements.get(element, 0) + 1
                     new_atom_name = f"{element}{lig_residue_elements[element]}"
-
-                    # Replace the old atom name with the new one
                     line = line[:12] + f"{new_atom_name:4}" + line[16:]
 
-            # Append the line to the new PDB lines
             new_pdb_lines.append(line)
 
-        # Write the modified PDB lines to the output file
         with open(output_pdb_file, "w") as f:
             f.writelines(new_pdb_lines)
 
-    def replace_atom_type(self, data):
-        """Replace wrong ligand atom types in the topology PDB file.
+    def replace_atom_type(self, data: str) -> str:
+        """
+        Replace incorrect ligand atom types in the topology PDB file.
 
         Args:
             data (str): Text of the initial PDB file.
@@ -236,18 +171,17 @@ def replace_atom_type(self, data):
         lines = data.split("\n")
         for i, line in enumerate(lines):
             if " LIG  X" in line:
-                # Extract the last column which contains the atom type (O/N/H)
                 atom_type = line[12:13].strip()
-                # Replace 'X' with the correct atom type
                 lines[i] = line.replace(" LIG  X", f" LIG  {atom_type}")
         return "\n".join(lines)
 
-    def process_pdb(self, input_file, output_file):
-        """Wrapper function to process a PDB file.
+    def process_pdb(self, input_file: str, output_file: str) -> None:
+        """
+        Wrapper function to process a PDB file.
 
         Args:
             input_file (str): Path to the input PDB file.
-            output_file (str): Path of the output PDB file.
+            output_file (str): Path to the output PDB file.
         """
         with open(input_file, "r") as f:
             pdb_data = f.read()

From 66bd74d87bce112dc0c91a5e182063314af1e99e Mon Sep 17 00:00:00 2001
From: ARMI <armenianapoli@gmail.com>
Date: Mon, 19 Aug 2024 14:07:57 +0200
Subject: [PATCH 05/23] Update barcode_generation.py

added type checking and type hinting
---
 .../openmmdl_analysis/barcode_generation.py   | 103 +++++++++++++-----
 1 file changed, 77 insertions(+), 26 deletions(-)

diff --git a/openmmdl/openmmdl_analysis/barcode_generation.py b/openmmdl/openmmdl_analysis/barcode_generation.py
index e362bb83..2ec99d02 100644
--- a/openmmdl/openmmdl_analysis/barcode_generation.py
+++ b/openmmdl/openmmdl_analysis/barcode_generation.py
@@ -1,20 +1,28 @@
 import os
 import numpy as np
+import pandas as pd
 import matplotlib.pyplot as plt
+from typing import List, Dict, Union
 
 
 class BarcodeGenerator:
-    def __init__(self, df):
+    def __init__(self, df: pd.DataFrame):
         """
         Initializes the BarcodeGenerator with a dataframe.
 
         Args:
-            df (pandas dataframe): Dataframe containing all interactions from plip analysis (typically df_all)
+            df (pd.DataFrame): Dataframe containing all interactions from plip analysis (typically df_all).
         """
         self.df = df
         self.interactions = self.gather_interactions()
 
-    def gather_interactions(self):
+    def gather_interactions(self) -> Dict[str, pd.Index]:
+        """
+        Gathers columns related to different types of interactions.
+
+        Returns:
+            Dict[str, pd.Index]: Dictionary where keys are interaction types and values are columns corresponding to those interactions.
+        """
         hydrophobic_interactions = self.df.filter(regex="hydrophobic").columns
         acceptor_interactions = self.df.filter(regex="Acceptor_hbond").columns
         donor_interactions = self.df.filter(regex="Donor_hbond").columns
@@ -39,15 +47,15 @@ def gather_interactions(self):
             "metal": metal_interactions,
         }
 
-    def generate_barcode(self, interaction):
+    def generate_barcode(self, interaction: str) -> np.ndarray:
         """
         Generates barcodes for a given interaction.
 
         Args:
-            interaction (str): Name of the interaction to generate a barcode for
+            interaction (str): Name of the interaction to generate a barcode for.
 
         Returns:
-            np.array: Binary array with 1 representing the interaction is present in the corresponding frame
+            np.ndarray: Binary array with 1 representing the interaction is present in the corresponding frame.
         """
         barcode = []
         unique_frames = self.df["FRAME"].unique()
@@ -61,20 +69,20 @@ def generate_barcode(self, interaction):
 
         return np.array(barcode)
 
-    def generate_waterids_barcode(self, interaction):
+    def generate_waterids_barcode(self, interaction: str) -> List[Union[int, int]]:
         """
         Generates a barcode containing corresponding water ids for a given interaction.
 
         Args:
-            interaction (str): Name of the interaction to generate a barcode for
+            interaction (str): Name of the interaction to generate a barcode for.
 
         Returns:
-            list: List of water ids for the frames where the interaction is present, 0 if no interaction present
+            List[Union[int, int]]: List of water ids for the frames where the interaction is present, 0 if no interaction present.
         """
         water_id_list = []
         waterid_barcode = []
 
-        for index, row in self.df.iterrows():
+        for _, row in self.df.iterrows():
             if row[interaction] == 1:
                 water_id_list.append(int(float(row["WATER_IDX"])))
 
@@ -88,15 +96,15 @@ def generate_waterids_barcode(self, interaction):
 
         return waterid_barcode
 
-    def interacting_water_ids(self, waterbridge_interactions):
-        """Generates a list of all water ids that form water bridge interactions.
+    def interacting_water_ids(self, waterbridge_interactions: List[str]) -> List[int]:
+        """
+        Generates a list of all water ids that form water bridge interactions.
 
         Args:
-            df_all (pandas dataframe): dataframe containing all interactions from plip analysis (typicaly df_all)
-            waterbridge_interactions (list): list of strings containing the names of all water bridge interactions
+            waterbridge_interactions (List[str]): List of strings containing the names of all water bridge interactions.
 
         Returns:
-            list: list of all unique water ids that form water bridge interactions
+            List[int]: List of all unique water ids that form water bridge interactions.
         """
         interacting_waters = []
         for waterbridge_interaction in waterbridge_interactions:
@@ -107,12 +115,34 @@ def interacting_water_ids(self, waterbridge_interactions):
         return list(set(interacting_waters))
 
 
+class BarcodeGenerator:
+    def __init__(self, df: pd.DataFrame):
+        self.df = df
+
+    def generate_waterids_barcode(self, interaction: str) -> List[Union[int, int]]:
+        # Placeholder implementation
+        return []
+
+
 class BarcodePlotter:
-    def __init__(self, df_all):
+    def __init__(self, df_all: pd.DataFrame):
+        """
+        Initializes the BarcodePlotter with a dataframe and BarcodeGenerator instance.
+
+        Args:
+            df_all (pd.DataFrame): Dataframe containing all interactions from plip analysis.
+        """
         self.df_all = df_all
         self.barcode_gen = BarcodeGenerator(df_all)
 
-    def plot_barcodes(self, barcodes, save_path):
+    def plot_barcodes(self, barcodes: Dict[str, np.ndarray], save_path: str) -> None:
+        """
+        Plots barcodes and saves the figure to the specified path.
+
+        Args:
+            barcodes (Dict[str, np.ndarray]): Dictionary where keys are interaction names and values are binary barcode arrays.
+            save_path (str): Path to save the plotted figure.
+        """
         if not barcodes:
             print("No barcodes to plot.")
             return
@@ -154,8 +184,19 @@ def plot_barcodes(self, barcodes, save_path):
         plt.savefig(save_path, dpi=300, bbox_inches="tight")
 
     def plot_waterbridge_piechart(
-        self, waterbridge_barcodes, waterbridge_interactions, fig_type
-    ):
+        self,
+        waterbridge_barcodes: Dict[str, np.ndarray],
+        waterbridge_interactions: List[str],
+        fig_type: str
+    ) -> None:
+        """
+        Plots pie charts for waterbridge interactions and saves them to files.
+
+        Args:
+            waterbridge_barcodes (Dict[str, np.ndarray]): Dictionary where keys are interaction names and values are binary barcode arrays.
+            waterbridge_interactions (List[str]): List of water bridge interaction names.
+            fig_type (str): File extension for the saved figures (e.g., 'png', 'pdf').
+        """
         if not waterbridge_barcodes:
             print("No Piecharts to plot.")
             return
@@ -201,7 +242,7 @@ def plot_waterbridge_piechart(
             plt.pie(
                 values,
                 labels=labels,
-                autopct=lambda pct: f"{pct:.1f}%\n({int(round(pct/100.0 * sum(values)))})",
+                autopct=lambda pct: f"{pct:.1f}%\n({int(round(pct / 100.0 * sum(values)))})",
                 shadow=False,
                 startangle=140,
             )
@@ -227,8 +268,18 @@ def plot_waterbridge_piechart(
                 dpi=300,
             )
 
-    def plot_barcodes_grouped(self, interactions, interaction_type, fig_type):
-        ligatoms_dict = {}
+    def plot_barcodes_grouped(
+        self, interactions: List[str], interaction_type: str, fig_type: str
+    ) -> None:
+        """
+        Plots grouped barcodes for interactions and saves the figure to a file.
+
+        Args:
+            interactions (List[str]): List of interaction names.
+            interaction_type (str): Type of interaction for grouping.
+            fig_type (str): File extension for the saved figure (e.g., 'png', 'pdf').
+        """
+        ligatoms_dict: Dict[str, List[str]] = {}
         for interaction in interactions:
             ligatom = interaction.split("_")
             ligatom.pop(0)
@@ -246,10 +297,10 @@ def plot_barcodes_grouped(self, interactions, interaction_type, fig_type):
             ligatom = "_".join(ligatom)
             ligatoms_dict.setdefault(ligatom, []).append(interaction)
 
-        total_interactions = {}
-        for ligatom in ligatoms_dict:
-            ligatom_interaction_barcodes = {}
-            for interaction in ligatoms_dict[ligatom]:
+        total_interactions: Dict[str, np.ndarray] = {}
+        for ligatom, interactions_list in ligatoms_dict.items():
+            ligatom_interaction_barcodes: Dict[str, np.ndarray] = {}
+            for interaction in interactions_list:
                 barcode = self.barcode_gen.generate_barcode(interaction)
                 ligatom_interaction_barcodes[interaction] = barcode
             os.makedirs(f"./Barcodes/{ligatom}", exist_ok=True)

From 50ab1ba7469c6bb90206ecf6d1fd53a4b659390f Mon Sep 17 00:00:00 2001
From: ARMI <armenianapoli@gmail.com>
Date: Mon, 19 Aug 2024 14:12:39 +0200
Subject: [PATCH 06/23] Update markov_state_figure_generation.py

added type checking and hinting
---
 .../markov_state_figure_generation.py         | 239 +++++++-----------
 1 file changed, 95 insertions(+), 144 deletions(-)

diff --git a/openmmdl/openmmdl_analysis/markov_state_figure_generation.py b/openmmdl/openmmdl_analysis/markov_state_figure_generation.py
index fd9d398d..7e40c020 100644
--- a/openmmdl/openmmdl_analysis/markov_state_figure_generation.py
+++ b/openmmdl/openmmdl_analysis/markov_state_figure_generation.py
@@ -2,18 +2,18 @@
 import matplotlib.pyplot as plt
 from matplotlib.patches import Patch
 import os
-
+from typing import Dict, List, Tuple, Union
 
 class MarkovChainAnalysis:
-    def __init__(self, min_transition):
+    def __init__(self, min_transition: float):
         self.min_transition = min_transition
-        self.min_transitions = self.calculate_min_transitions()
+        self.min_transitions: List[float] = self.calculate_min_transitions()
 
-    def calculate_min_transitions(self):
+    def calculate_min_transitions(self) -> List[float]:
         """Calculates a list based on the minimum transition time provided values and returns it in factors 1, 2, 5, 10.
 
         Returns:
-            list: List with the minimum transition time with factors 1, 2, 5, 10.
+            List[float]: List with the minimum transition time with factors 1, 2, 5, 10.
         """
         min_transitions = [
             self.min_transition,
@@ -24,42 +24,47 @@ def calculate_min_transitions(self):
         return min_transitions
 
     def generate_transition_graph(
-        self, total_frames, combined_dict, fig_type="png", font_size=36, size_node=200
-    ):
+        self,
+        total_frames: int,
+        combined_dict: Dict[str, List[str]],
+        fig_type: str = "png",
+        font_size: int = 36,
+        size_node: int = 200
+    ) -> None:
         """Generate Markov Chain plots based on transition probabilities.
 
         Args:
             total_frames (int): The number of frames in the protein-ligand MD simulation.
-            combined_dict (dict): A dictionary with the information of the Binding Modes and their order of appearance during the simulation for all frames.
+            combined_dict (Dict[str, List[str]]): A dictionary with the information of the Binding Modes and their order of appearance during the simulation for all frames.
             fig_type (str, optional): File type for the output figures. Default is 'png'.
             font_size (int, optional): The font size for the node labels. The default value is set to 36.
             size_node (int, optional): The size of the nodes in the Markov Chain plot. The default value is set to 200.
         """
         # Calculate the number of elements in each part
-        total_length = len(combined_dict["all"])
-        part_length = total_length // 3
-        remaining_length = total_length % 3
+        total_length: int = len(combined_dict["all"])
+        part_length: int = total_length // 3
+        remaining_length: int = total_length % 3
 
         # Divide the 'all_data' into three parts
-        part1_length = part_length + remaining_length
-        part2_length = part_length
-        part1_data = combined_dict["all"][:part1_length]
-        part2_data = combined_dict["all"][part1_length : part1_length + part2_length]
-        part3_data = combined_dict["all"][part1_length + part2_length :]
+        part1_length: int = part_length + remaining_length
+        part2_length: int = part_length
+        part1_data: List[str] = combined_dict["all"][:part1_length]
+        part2_data: List[str] = combined_dict["all"][part1_length: part1_length + part2_length]
+        part3_data: List[str] = combined_dict["all"][part1_length + part2_length:]
 
         # Count the occurrences of each node in each part
-        part1_node_occurrences = {
+        part1_node_occurrences: Dict[str, int] = {
             node: part1_data.count(node) for node in set(part1_data)
         }
-        part2_node_occurrences = {
+        part2_node_occurrences: Dict[str, int] = {
             node: part2_data.count(node) for node in set(part2_data)
         }
-        part3_node_occurrences = {
+        part3_node_occurrences: Dict[str, int] = {
             node: part3_data.count(node) for node in set(part3_data)
         }
 
         # Create the legend
-        legend_labels = {
+        legend_labels: Dict[str, str] = {
             "Blue": "Binding mode not in top 10 occurrence",
             "Green": "Binding Mode occurrence mostly in first third of frames",
             "Orange": "Binding Mode occurrence mostly in second third of frames",
@@ -67,134 +72,102 @@ def generate_transition_graph(
             "Yellow": "Binding Mode occurs throughout all trajectory equally",
         }
 
-        legend_colors = ["skyblue", "green", "orange", "red", "yellow"]
+        legend_colors: List[str] = ["skyblue", "green", "orange", "red", "yellow"]
 
-        legend_handles = [
+        legend_handles: List[Patch] = [
             Patch(color=color, label=label)
             for color, label in zip(legend_colors, legend_labels.values())
         ]
 
         # Get the top 10 nodes with the most occurrences
-        node_occurrences = {
+        node_occurrences: Dict[str, int] = {
             node: combined_dict["all"].count(node) for node in set(combined_dict["all"])
         }
-        top_10_nodes = sorted(node_occurrences, key=node_occurrences.get, reverse=True)[
-            :10
-        ]
+        top_10_nodes: List[str] = sorted(node_occurrences, key=node_occurrences.get, reverse=True)[:10]
 
         for min_transition_percent in self.min_transitions:
-            min_prob = min_transition_percent / 100  # Convert percentage to probability
+            min_prob: float = min_transition_percent / 100  # Convert percentage to probability
 
             # Create a directed graph
-            G = nx.DiGraph()
+            G: nx.DiGraph = nx.DiGraph()
 
             # Count the occurrences of each transition and self-loop
-            transitions = {}
-            self_loops = {}
+            transitions: Dict[Tuple[str, str], int] = {}
+            self_loops: Dict[Tuple[str, str], int] = {}
             for i in range(len(combined_dict["all"]) - 1):
-                current_state = combined_dict["all"][i]
-                next_state = combined_dict["all"][i + 1]
+                current_state: str = combined_dict["all"][i]
+                next_state: str = combined_dict["all"][i + 1]
 
                 if current_state == next_state:  # Check for self-loop
-                    self_loop = (current_state, next_state)
+                    self_loop: Tuple[str, str] = (current_state, next_state)
                     self_loops[self_loop] = self_loops.get(self_loop, 0) + 1
                 else:
-                    transition = (current_state, next_state)
+                    transition: Tuple[str, str] = (current_state, next_state)
                     transitions[transition] = transitions.get(transition, 0) + 1
 
             # Add edges to the graph with their probabilities
             for transition, count in transitions.items():
                 current_state, next_state = transition
-                probability = (
-                    count / len(combined_dict["all"]) * 100
-                )  # Convert probability to percentage
+                probability: float = (count / len(combined_dict["all"])) * 100  # Convert probability to percentage
                 if probability >= min_transition_percent:
                     G.add_edge(current_state, next_state, weight=probability)
                     # Include the reverse transition with a different color
-                    reverse_transition = (next_state, current_state)
-                    reverse_count = transitions.get(
-                        reverse_transition, 0
-                    )  # Use the correct count for the reverse transition
-                    reverse_probability = (
-                        reverse_count / len(combined_dict["all"]) * 100
-                    )
-                    G.add_edge(
-                        next_state,
-                        current_state,
-                        weight=reverse_probability,
-                        reverse=True,
-                    )
+                    reverse_transition: Tuple[str, str] = (next_state, current_state)
+                    reverse_count: int = transitions.get(reverse_transition, 0)  # Use the correct count for the reverse transition
+                    reverse_probability: float = (reverse_count / len(combined_dict["all"])) * 100
+                    G.add_edge(next_state, current_state, weight=reverse_probability, reverse=True)
 
             # Add self-loops to the graph with their probabilities
             for self_loop, count in self_loops.items():
-                state = self_loop[0]
-                probability = (
-                    count / len(combined_dict["all"]) * 100
-                )  # Convert probability to percentage
+                state: str = self_loop[0]
+                probability: float = (count / len(combined_dict["all"])) * 100  # Convert probability to percentage
                 if probability >= min_transition_percent:
                     G.add_edge(state, state, weight=probability)
 
             # Calculate transition probabilities for each direction (excluding self-loops)
-            transition_probabilities_forward = {}
-            transition_probabilities_backward = {}
-            transition_occurrences_forward = {}
-            transition_occurrences_backward = {}
+            transition_probabilities_forward: Dict[Tuple[str, str], float] = {}
+            transition_probabilities_backward: Dict[Tuple[str, str], float] = {}
+            transition_occurrences_forward: Dict[Tuple[str, str], float] = {}
+            transition_occurrences_backward: Dict[Tuple[str, str], float] = {}
 
             for transition, count in transitions.items():
                 start_state, end_state = transition
-                forward_transition = (start_state, end_state)
-                backward_transition = (end_state, start_state)
+                forward_transition: Tuple[str, str] = (start_state, end_state)
+                backward_transition: Tuple[str, str] = (end_state, start_state)
 
                 # Separate counts for forward and backward transitions
-                forward_count = transitions.get(forward_transition, 0)
-                backward_count = transitions.get(backward_transition, 0)
+                forward_count: int = transitions.get(forward_transition, 0)
+                backward_count: int = transitions.get(backward_transition, 0)
 
-                transition_probabilities_forward[forward_transition] = (
-                    forward_count / node_occurrences[start_state] * 100
-                )
+                transition_probabilities_forward[forward_transition] = (forward_count / node_occurrences[start_state]) * 100
+                transition_occurrences_forward[forward_transition] = (forward_count / len(combined_dict["all"])) * 100
 
-                transition_occurrences_forward[forward_transition] = (
-                    forward_count / len(combined_dict["all"]) * 100
-                )
-
-                transition_probabilities_backward[backward_transition] = (
-                    backward_count / node_occurrences[end_state] * 100
-                )
-
-                transition_occurrences_backward[backward_transition] = (
-                    backward_count / len(combined_dict["all"]) * 100
-                )
+                transition_probabilities_backward[backward_transition] = (backward_count / node_occurrences[end_state]) * 100
+                transition_occurrences_backward[backward_transition] = (backward_count / len(combined_dict["all"])) * 100
 
             # Calculate self-loop probabilities
-            self_loop_probabilities = {}
-            self_loop_occurences = {}
+            self_loop_probabilities: Dict[str, float] = {}
+            self_loop_occurences: Dict[str, float] = {}
             for self_loop, count in self_loops.items():
-                state = self_loop[0]
+                state: str = self_loop[0]
                 self_loop_probabilities[state] = count / node_occurrences[state]
                 self_loop_occurences[state] = count / len(combined_dict["all"]) * 100
 
             # Generate the Markov Chain plot
             plt.figure(figsize=(60, 60))  # Increased figure size
-            plt.title(
-                f"Markov Chain Plot {min_transition_percent}% Frames Transition",
-                fontsize=72,
-            )
+            plt.title(f"Markov Chain Plot {min_transition_percent}% Frames Transition", fontsize=72)
 
             # Draw nodes and edges
-            pos = nx.spring_layout(
-                G, k=2, seed=42
-            )  # Increased distance between nodes (k=2)
-            edge_colors = []
+            pos: Dict[str, Tuple[float, float]] = nx.spring_layout(G, k=2, seed=42)  # Increased distance between nodes (k=2)
+            edge_colors: List[str] = []
 
             for u, v, data in G.edges(data=True):
-                weight = data["weight"]
+                weight: float = data["weight"]
 
                 if u == v:  # Check if it is a self-loop
                     edge_colors.append("green")  # Set green color for self-loop arrows
-                    width = 0.1  # Make self-loop arrows smaller
-                    connection_style = (
-                        "arc3,rad=-0.1"  # Make the self-loops more curved and compact
-                    )
+                    width: float = 0.1  # Make self-loop arrows smaller
+                    connection_style: str = "arc3,rad=-0.1"  # Make the self-loops more curved and compact
                     nx.draw_networkx_edges(
                         G,
                         pos,
@@ -205,18 +178,16 @@ def generate_transition_graph(
                         connectionstyle=connection_style,
                     )
                 elif weight >= min_transition_percent:
-                    edge_colors.append(
-                        "black"
-                    )  # Highlight significant transitions in red
+                    edge_colors.append("black")  # Highlight significant transitions in black
 
                     # Check if both nodes are present before adding labels
                     if G.has_node(u) and G.has_node(v):
-                        width = 4.0
-                        forward_label = f"{transition_occurrences_forward.get((v, u), 0):.2f}% of Frames →, {transition_probabilities_forward.get((v, u), 0):.2f}% probability"
-                        backward_label = f"{transition_occurrences_backward.get((u, v), 0):.2f}% of Frames ←, {transition_probabilities_backward.get((u, v), 0):.2f}% probability"
-                        edge_label = f"{forward_label}\n{backward_label}"
+                        width: float = 4.0
+                        forward_label: str = f"{transition_occurrences_forward.get((v, u), 0):.2f}% of Frames →, {transition_probabilities_forward.get((v, u), 0):.2f}% probability"
+                        backward_label: str = f"{transition_occurrences_backward.get((u, v), 0):.2f}% of Frames ←, {transition_probabilities_backward.get((u, v), 0):.2f}% probability"
+                        edge_label: str = f"{forward_label}\n{backward_label}"
 
-                        connection_style = "arc3,rad=-0.1"
+                        connection_style: str = "arc3,rad=-0.1"
                         nx.draw_networkx_edges(
                             G,
                             pos,
@@ -226,22 +197,18 @@ def generate_transition_graph(
                             edge_color=edge_colors[-1],
                             connectionstyle=connection_style,
                         )
-                        nx.draw_networkx_edge_labels(
-                            G, pos, edge_labels={(u, v): edge_label}, font_size=26
-                        )
+                        nx.draw_networkx_edge_labels(G, pos, edge_labels={(u, v): edge_label}, font_size=26)
                 else:
-                    edge_colors.append(
-                        "grey"
-                    )  # Use black for non-significant transitions
-                    width = 0.5
+                    edge_colors.append("grey")  # Use grey for non-significant transitions
+                    width: float = 0.5
 
                     # Check if both nodes are present before adding labels
                     if G.has_node(u) and G.has_node(v):
-                        forward_label = f"{transition_occurrences_forward.get((v, u), 0):.2f}% of Frames →, {transition_probabilities_forward.get((v, u), 0):.2f}% probability"
-                        backward_label = f"{transition_occurrences_backward.get((u, v), 0):.2f}% of Frames ←, {transition_probabilities_backward.get((u, v), 0):.2f}% probability"
-                        edge_label = f"{forward_label}\n{backward_label}"
+                        forward_label: str = f"{transition_occurrences_forward.get((v, u), 0):.2f}% of Frames →, {transition_probabilities_forward.get((v, u), 0):.2f}% probability"
+                        backward_label: str = f"{transition_occurrences_backward.get((u, v), 0):.2f}% of Frames ←, {transition_probabilities_backward.get((u, v), 0):.2f}% probability"
+                        edge_label: str = f"{forward_label}\n{backward_label}"
 
-                        connection_style = "arc3,rad=-0.1"
+                        connection_style: str = "arc3,rad=-0.1"
                         nx.draw_networkx_edges(
                             G,
                             pos,
@@ -251,23 +218,15 @@ def generate_transition_graph(
                             edge_color=edge_colors[-1],
                             connectionstyle=connection_style,
                         )
-                        nx.draw_networkx_edge_labels(
-                            G, pos, edge_labels={(u, v): edge_label}, font_size=36
-                        )
+                        nx.draw_networkx_edge_labels(G, pos, edge_labels={(u, v): edge_label}, font_size=36)
 
             # Update the node colors based on their appearance percentages in each part
-            node_colors = []
+            node_colors: List[str] = []
             for node in G.nodes():
                 if node in top_10_nodes:
-                    part1_percentage = (
-                        part1_node_occurrences.get(node, 0) / node_occurrences[node]
-                    )
-                    part2_percentage = (
-                        part2_node_occurrences.get(node, 0) / node_occurrences[node]
-                    )
-                    part3_percentage = (
-                        part3_node_occurrences.get(node, 0) / node_occurrences[node]
-                    )
+                    part1_percentage: float = (part1_node_occurrences.get(node, 0) / node_occurrences[node])
+                    part2_percentage: float = (part2_node_occurrences.get(node, 0) / node_occurrences[node])
+                    part3_percentage: float = (part3_node_occurrences.get(node, 0) / node_occurrences[node])
 
                     if part1_percentage > 0.5:
                         node_colors.append("green")
@@ -281,22 +240,20 @@ def generate_transition_graph(
                     node_colors.append("skyblue")
 
             # Draw nodes with sizes correlated to occurrences and color top 10 nodes
-            node_size = [size_node * node_occurrences[node] for node in G.nodes()]
-            nx.draw_networkx_nodes(
-                G, pos, node_size=node_size, node_color=node_colors, alpha=0.8
-            )
+            node_size: List[int] = [size_node * node_occurrences[node] for node in G.nodes()]
+            nx.draw_networkx_nodes(G, pos, node_size=node_size, node_color=node_colors, alpha=0.8)
 
             # Draw node labels with occurrence percentage and self-loop probability for nodes with edges
-            node_labels = {}
+            node_labels: Dict[str, str] = {}
 
             for node in G.nodes():
                 if G.degree(node) > 0:  # Check if the node has at least one edge
-                    edges_with_node = [
+                    edges_with_node: List[Tuple[str, str, float]] = [
                         (u, v, data["weight"])
                         for u, v, data in G.edges(data=True)
                         if u == node or v == node
                     ]
-                    relevant_edges = [
+                    relevant_edges: List[Tuple[str, str, float]] = [
                         edge
                         for edge in edges_with_node
                         if edge[2] >= min_transition_percent
@@ -304,14 +261,10 @@ def generate_transition_graph(
 
                     if relevant_edges:
                         if node in top_10_nodes:
-                            node_occurrence_percentage = (
-                                node_occurrences[node] / len(combined_dict["all"]) * 100
-                            )
-                            self_loop_probability = (
-                                self_loop_probabilities.get(node, 0) * 100
-                            )
-                            self_loop_occurence = self_loop_occurences.get(node, 0)
-                            node_label = f"{node}\nOccurrences: {node_occurrence_percentage:.2f}%\nSelf-Loop Probability: {self_loop_probability:.2f}% \nSelf-Loop Occurrence: {self_loop_occurence:.2f}%"
+                            node_occurrence_percentage: float = (node_occurrences[node] / len(combined_dict["all"])) * 100
+                            self_loop_probability: float = self_loop_probabilities.get(node, 0) * 100
+                            self_loop_occurence: float = self_loop_occurences.get(node, 0)
+                            node_label: str = f"{node}\nOccurrences: {node_occurrence_percentage:.2f}%\nSelf-Loop Probability: {self_loop_probability:.2f}% \nSelf-Loop Occurrence: {self_loop_occurence:.2f}%"
                             node_labels[node] = node_label
                         else:
                             node_labels[node] = node
@@ -332,11 +285,9 @@ def generate_transition_graph(
             plt.tight_layout()
 
             # Save the plot
-            plot_filename = f"markov_chain_plot_{min_transition_percent}.{fig_type}"
-            plot_path = os.path.join("Binding_Modes_Markov_States", plot_filename)
-            os.makedirs(
-                "Binding_Modes_Markov_States", exist_ok=True
-            )  # Create the folder if it doesn't exist
+            plot_filename: str = f"markov_chain_plot_{min_transition_percent}.{fig_type}"
+            plot_path: str = os.path.join("Binding_Modes_Markov_States", plot_filename)
+            os.makedirs("Binding_Modes_Markov_States", exist_ok=True)  # Create the folder if it doesn't exist
 
             plt.savefig(plot_path, dpi=300)
             plt.clf()  # Clear the current figure for the next iteration

From b43a021b075b0843f7927b2e8f871114b609d29b Mon Sep 17 00:00:00 2001
From: ARMI <armenianapoli@gmail.com>
Date: Mon, 19 Aug 2024 14:19:57 +0200
Subject: [PATCH 07/23] Update find_stable_waters.py

added type checking and hinting
---
 .../openmmdl_analysis/find_stable_waters.py   | 116 ++++++------------
 1 file changed, 39 insertions(+), 77 deletions(-)

diff --git a/openmmdl/openmmdl_analysis/find_stable_waters.py b/openmmdl/openmmdl_analysis/find_stable_waters.py
index a67659a3..88e6ccb2 100644
--- a/openmmdl/openmmdl_analysis/find_stable_waters.py
+++ b/openmmdl/openmmdl_analysis/find_stable_waters.py
@@ -5,76 +5,62 @@
 from sklearn.cluster import DBSCAN
 from tqdm import tqdm
 from io import StringIO
-from Bio.PDB import PDBParser
-
+from Bio.PDB import PDBParser, Structure
+from typing import Tuple, Dict, List, Optional
 
 class StableWaters:
-    def __init__(self, trajectory, topology, water_eps):
+    def __init__(self, trajectory: str, topology: str, water_eps: float) -> None:
         self.trajectory = trajectory
         self.topology = topology
         self.u = mda.Universe(self.topology, self.trajectory)
         self.water_eps = water_eps
 
-    def trace_waters(self, output_directory):
-        """trace the water molecules in a trajectory and write all which move below one Angstrom distance. To adjust the distance alter the integer
+    def trace_waters(self, output_directory: str) -> Tuple[pd.DataFrame, int]:
+        """Trace the water molecules in a trajectory and write all which move below one Angstrom distance. To adjust the distance alter the integer
         Args:
-            topology (str): Path to the topology file.
-            trajectory (str): Path to the trajectory file.
             output_directory (str): Directory where output files will be saved.
 
         Returns:
-            pd.DataFrame: DataFrame containing stable water coordinates.
-            int: Total number of frames.
+            Tuple[pd.DataFrame, int]: DataFrame containing stable water coordinates and total number of frames.
         """
-        # Get the total number of frames for the progress bar
         total_frames = len(self.u.trajectory)
-        # Create an empty DataFrame to store stable water coordinates
         stable_waters = pd.DataFrame(
             columns=["Frame", "Residue", "Oxygen_X", "Oxygen_Y", "Oxygen_Z"]
         )
 
-        # Initialize variables for the previous frame's data
         prev_frame_coords = {}
 
-        # Iterate through frames with tqdm for the progress bar
         for ts in tqdm(
             self.u.trajectory,
             total=total_frames,
-            desc="Processing frames for the wateranalysis",
+            desc="Processing frames for the water analysis",
         ):
             frame_num = ts.frame
             frame_coords = {}
 
-            # Iterate through oxygen atoms of the specified water type
-            # for atom in u.select_atoms(f"resname {water_type} and name O"):
-            for atom in self.u.select_atoms(f"resname HOH and name O"):
+            for atom in self.u.select_atoms("resname HOH and name O"):
                 frame_coords[atom.index] = (
                     atom.position[0],
                     atom.position[1],
                     atom.position[2],
                 )
 
-            # Check if it's not the first frame
             if frame_num > 0:
                 stable_coords = []
 
-                # Iterate through the oxygen atoms in the current frame
                 for atom_index, coords in frame_coords.items():
                     prev_coords = prev_frame_coords.get(
                         atom_index, coords
-                    )  # Get previous coordinates or use current if not found
+                    )
 
-                    # Calculate the distance between current and previous coordinates
                     distance = np.linalg.norm(np.array(coords) - np.array(prev_coords))
 
-                    # If the distance is less than 1 Angstrom, consider it a stable water
                     if distance < 1:
                         stable_coords.append(
                             (frame_num, atom_index, coords[0], coords[1], coords[2])
                         )
 
-                # Append stable water coordinates to the stable_waters DataFrame
-                if stable_coords:  # Check if stable_coords is not empty
+                if stable_coords:
                     stable_waters = pd.concat(
                         [
                             stable_waters,
@@ -91,7 +77,6 @@ def trace_waters(self, output_directory):
                         ]
                     )
 
-            # Update the previous frame's coordinates
             prev_frame_coords = frame_coords
 
         stable_waters.to_csv(
@@ -100,21 +85,19 @@ def trace_waters(self, output_directory):
         return stable_waters, total_frames
 
     def perform_clustering_and_writing(
-        self, stable_waters, cluster_eps, total_frames, output_directory
-    ):
+        self, stable_waters: pd.DataFrame, cluster_eps: float, total_frames: int, output_directory: str
+    ) -> None:
         """
         Perform DBSCAN clustering on the stable water coordinates, and write the clusters and their representatives to PDB files.
 
         Args:
             stable_waters (pd.DataFrame): DataFrame containing stable water coordinates.
-            cluster_eps (float): DBSCAN clustering epsilon parameter. This is in Angstrom in this case, and defines which Water distances should be within one cluster
+            cluster_eps (float): DBSCAN clustering epsilon parameter.
             total_frames (int): Total number of frames.
             output_directory (str): Directory where output files will be saved.
         """
-        # Feature extraction: XYZ coordinates
         X = stable_waters[["Oxygen_X", "Oxygen_Y", "Oxygen_Z"]]
 
-        # List of percentages to iterate over
         percentage_values = [25, 50, 75, 90, 99]
 
         for percent in percentage_values:
@@ -138,8 +121,8 @@ def perform_clustering_and_writing(
             )
 
     def write_pdb_clusters_and_representatives(
-        self, clustered_waters, min_samples, output_sub_directory
-    ):
+        self, clustered_waters: pd.DataFrame, min_samples: int, output_sub_directory: str
+    ) -> None:
         """
         Writes the clusters and their representatives to PDB files.
 
@@ -147,7 +130,6 @@ def write_pdb_clusters_and_representatives(
             clustered_waters (pd.DataFrame): DataFrame containing clustered water coordinates.
             min_samples (int): Minimum number of samples for DBSCAN clustering.
             output_sub_directory (str): Subdirectory where output PDB files will be saved.
-
         """
         atom_counter = 1
         pdb_file_counter = 1
@@ -156,7 +138,7 @@ def write_pdb_clusters_and_representatives(
         os.makedirs(output_sub_directory, exist_ok=True)
         with pd.option_context(
             "display.max_rows", None
-        ):  # Temporarily set display options
+        ):
             for label, cluster in clustered_waters.groupby("Cluster_Label"):
                 pdb_lines = []
                 for _, row in cluster.iterrows():
@@ -166,7 +148,6 @@ def write_pdb_clusters_and_representatives(
                     pdb_lines.append(pdb_line)
                     atom_counter += 1
 
-                # Write the current cluster to a new PDB file
                 output_filename = os.path.join(
                     output_sub_directory, f"cluster_{label}.pdb"
                 )
@@ -176,7 +157,6 @@ def write_pdb_clusters_and_representatives(
 
                 pdb_file_counter += 1
 
-            # Write representative water molecules to a PDB file
             representative_waters = clustered_waters.groupby("Cluster_Label").mean()
             representative_waters.reset_index(inplace=True)
             representative_filename = os.path.join(
@@ -188,36 +168,27 @@ def write_pdb_clusters_and_representatives(
                     pdb_line = f"ATOM{index + 1:6}  O   WAT A{index + 1:4}    {x:8.3f}{y:8.3f}{z:8.3f}  1.00  0.00           O\n"
                     pdb_file.write(pdb_line)
 
-    # Example usage
-    # stable_waters_pipeline("topology_file", "trajectory_file", 0.5)
-    def stable_waters_pipeline(self, output_directory="./stableWaters"):
+    def stable_waters_pipeline(self, output_directory: str = "./stableWaters") -> None:
         """Function to run the pipeline to extract stable water clusters, and their representatives from a PDB & DCD file
         Args:
-            topology (str): Path to the topology file.
-            trajectory (str): Path to the trajectory file.
-            water_eps (float): DBSCAN clustering epsilon parameter.
             output_directory (str, optional): Directory where output files will be saved. Default is "./stableWaters".
-
         """
-        # Load the PDB and DCD files
         output_directory += "_clusterEps_"
         strEps = str(self.water_eps).replace(".", "")
         output_directory += strEps
         os.makedirs(output_directory, exist_ok=True)
-        # Create a stable waters list by calling the process_trajectory_and_cluster function
         stable_waters, total_frames = self.trace_waters(output_directory)
-        # Now call perform_clustering_and_writing with the returned values
         self.perform_clustering_and_writing(
             stable_waters, self.water_eps, total_frames, output_directory
         )
 
-    def filter_and_parse_pdb(protein_pdb):
-        """This function reads in a PDB and returns the structure with bioparser.
+    def filter_and_parse_pdb(self, protein_pdb: str) -> Structure:
+        """Reads in a PDB and returns the structure with bioparser.
         Args:
             protein_pdb (str): Path to a protein PDB file.
 
         Returns:
-            biopython.structure: PDB structure object.
+            Structure: Biopython PDB structure object.
         """
         with open(protein_pdb, "r") as pdb_file:
             lines = [
@@ -226,38 +197,35 @@ def filter_and_parse_pdb(protein_pdb):
                 if (
                     line.startswith("ATOM")
                     and line[17:20].strip() not in ["HOH", "WAT", "T4P", "T3P"]
-                    and line[22:26]
-                    .strip()
-                    .isdigit()  # Exclude lines with non-numeric sequence identifiers
+                    and line[22:26].strip().isdigit()
                 )
             ]
 
-        # Convert the list of lines to a string buffer
         pdb_string = "".join(lines)
         pdb_buffer = StringIO(pdb_string)
 
-        # Now parse the filtered lines
         parser = PDBParser(QUIET=True)
         structure = parser.get_structure("protein", pdb_buffer)
 
         return structure
 
-    def find_interacting_residues(structure, representative_waters, distance_threshold):
-        """This function maps waters (e.g. the representative waters) to interacting residues of a different PDB structure input. Use "filter_and_parse_pdb" to get the input for this function
+    def find_interacting_residues(
+        self, structure: Structure, representative_waters: pd.DataFrame, distance_threshold: float
+    ) -> Dict[int, List[Tuple[str, int]]]:
+        """Maps waters (e.g. the representative waters) to interacting residues of a different PDB structure input.
         Args:
-            structure (biopython.structure): Biopython PDB structure object.
-            representative_waters (pandasd.DataFrame): DataFrame containing representative water coordinates.
+            structure (Structure): Biopython PDB structure object.
+            representative_waters (pd.DataFrame): DataFrame containing representative water coordinates.
             distance_threshold (float): Threshold distance for identifying interacting residues.
 
         Returns:
-            dict: Dictionary mapping cluster numbers to interacting residues.
+            Dict[int, List[Tuple[str, int]]]: Dictionary mapping cluster numbers to interacting residues.
         """
         interacting_residues = {}
 
         for model in structure:
             for chain in model:
                 for residue in chain:
-                    # Check if the residue is a protein residue (not a heteroatom or water molecule)
                     if (
                         residue.id[0] == " "
                         and residue.id[2] == " "
@@ -277,7 +245,7 @@ def find_interacting_residues(structure, representative_waters, distance_thresho
 
                             distance = np.linalg.norm(wat_coords - residue_coords)
                             if distance < distance_threshold:
-                                key = wat_index  # Assuming wat_index is the number of the water cluster
+                                key = wat_index
                                 if key not in interacting_residues:
                                     interacting_residues[key] = []
                                 interacting_residues[key].append(
@@ -286,19 +254,18 @@ def find_interacting_residues(structure, representative_waters, distance_thresho
 
         return interacting_residues
 
-    def read_pdb_as_dataframe(pdb_file):
+    def read_pdb_as_dataframe(self, pdb_file: str) -> pd.DataFrame:
         """Helper function reading a PDB
         Args:
             pdb_file (str): Path to the PDB file.
 
         Returns:
-            pandas.DataFrame: DataFrame containing PDB data.
+            pd.DataFrame: DataFrame containing PDB data.
         """
         lines = []
         with open(pdb_file, "r") as f:
             lines = f.readlines()
 
-        # Extract relevant information from PDB file lines
         data = []
         for line in lines:
             if line.startswith("ATOM"):
@@ -307,25 +274,23 @@ def read_pdb_as_dataframe(pdb_file):
                 z = float(line[46:54].strip())
                 data.append([x, y, z])
 
-        # Create a DataFrame
         columns = ["Oxygen_X", "Oxygen_Y", "Oxygen_Z"]
         representative_waters = pd.DataFrame(data, columns=columns)
 
         return representative_waters
 
-    # Analyse protein and water interaction, get the residues and the corresponding weater molecules that interact.
     def analyze_protein_and_water_interaction(
         self,
-        protein_pdb_file,
-        representative_waters_file,
-        cluster_eps,
-        output_directory="./stableWaters",
-        distance_threshold=5.0,
-    ):
-        """Analyse the interaction of residues to water molecules using a threshold that can be specified when calling the function
+        protein_pdb_file: str,
+        representative_waters_file: str,
+        cluster_eps: float,
+        output_directory: str = "./stableWaters",
+        distance_threshold: float = 5.0,
+    ) -> None:
+        """Analyze the interaction of residues to water molecules using a threshold that can be specified when calling the function
         Args:
             protein_pdb_file (str): Path to the protein PDB file without waters.
-            representative_waters_file (str): Path to the representative waters PDB file, or any PDB file containing only waters
+            representative_waters_file (str): Path to the representative waters PDB file, or any PDB file containing only waters.
             cluster_eps (float): DBSCAN clustering epsilon parameter.
             output_directory (str, optional): Directory where output files will be saved. Default is "./stableWaters".
             distance_threshold (float, optional): Threshold distance for identifying interacting residues. Default is 5.0 (Angstrom).
@@ -334,11 +299,9 @@ def analyze_protein_and_water_interaction(
         strEps = str(cluster_eps).replace(".", "")
         output_directory += strEps
 
-        # Iterate over subdirectories
         for subdirectory in os.listdir(output_directory):
             subdirectory_path = os.path.join(output_directory, subdirectory)
             if os.path.isdir(subdirectory_path):
-                # Perform operations within each subdirectory
                 representative_waters = self.read_pdb_as_dataframe(
                     os.path.join(subdirectory_path, representative_waters_file)
                 )
@@ -350,7 +313,6 @@ def analyze_protein_and_water_interaction(
                     interacting_residues.items(),
                     columns=["Cluster_Number", "Interacting_Residues"],
                 )
-                # Save result to each subdirectory
                 result_df.to_csv(
                     os.path.join(subdirectory_path, "interacting_residues.csv"),
                     index=False,

From 7d4b2423008e0514a7843a5909442f6696cb9572 Mon Sep 17 00:00:00 2001
From: ARMI <armenianapoli@gmail.com>
Date: Mon, 19 Aug 2024 14:25:01 +0200
Subject: [PATCH 08/23] Update interaction_gathering.py

added type checking and hinting
---
 .../interaction_gathering.py                  | 342 +++++-------------
 1 file changed, 90 insertions(+), 252 deletions(-)

diff --git a/openmmdl/openmmdl_analysis/interaction_gathering.py b/openmmdl/openmmdl_analysis/interaction_gathering.py
index aa56fbb0..30e697be 100644
--- a/openmmdl/openmmdl_analysis/interaction_gathering.py
+++ b/openmmdl/openmmdl_analysis/interaction_gathering.py
@@ -3,17 +3,22 @@
 import MDAnalysis as mda
 from tqdm import tqdm
 from plip.basic import config
-from plip.structure.preparation import PDBComplex, LigandFinder, Mol, PLInteraction
+from plip.structure.preparation import PDBComplex, PLInteraction
 from plip.exchange.report import BindingSiteReport
 from multiprocessing import Pool
-from functools import partial
+from typing import Optional, List, Dict, Tuple
 
 config.KEEPMOD = True
 
-
 class InteractionAnalyzer:
     def __init__(
-        self, pdb_md, dataframe, num_processes, lig_name, special_ligand, peptide
+        self,
+        pdb_md: mda.Universe,
+        dataframe: Optional[str],
+        num_processes: int,
+        lig_name: str,
+        special_ligand: Optional[str],
+        peptide: Optional[str]
     ):
         self.pdb_md = pdb_md
         self.dataframe = dataframe
@@ -25,15 +30,15 @@ def __init__(
 
     def characterize_complex(
         self, pdb_file: str, binding_site_id: str
-    ) -> PLInteraction:
-        """Characterize the protein-ligand complex and return their interaction set
+    ) -> Optional[PLInteraction]:
+        """Characterize the protein-ligand complex and return their interaction set.
 
         Args:
-            pdb_file (str): A string, which represents the path to the PDB File
-            binding_site_id (str): A string that specifies the identifier of the binding site
+            pdb_file (str): Path to the PDB file.
+            binding_site_id (str): Identifier of the binding site.
 
         Returns:
-            PLInteraction: A object representing the interactions if. If Binding site is not found returns None
+            Optional[PLInteraction]: Interaction object or None if the binding site is not found.
         """
         pdb_complex = PDBComplex()
         pdb_complex.load_pdb(pdb_file)
@@ -43,31 +48,30 @@ def characterize_complex(
                 == binding_site_id
             ):
                 pdb_complex.characterize_complex(ligand)
+                return pdb_complex.interaction_sets.get(binding_site_id)
+        return None
 
-        return pdb_complex.interaction_sets[binding_site_id]
-
-    def retrieve_plip_interactions(self, pdb_file, lig_name):
+    def retrieve_plip_interactions(
+        self, pdb_file: str, lig_name: str
+    ) -> Dict[str, Dict[str, List]]:
         """Retrieves the interactions from PLIP.
 
         Args:
-            pdb_file (str): The path of the PDB file of the complex.
-            lig_name (str): Name of the Ligand in the complex topology that will be analyzed.
+            pdb_file (str): Path of the PDB file of the complex.
+            lig_name (str): Name of the ligand in the complex.
 
         Returns:
-            dict: A dictionary of the binding sites and the interactions.
+            Dict[str, Dict[str, List]]: Dictionary of binding sites and their interactions.
         """
         protlig = PDBComplex()
-        protlig.load_pdb(pdb_file)  # load the pdb file
+        protlig.load_pdb(pdb_file)
         for ligand in protlig.ligands:
             if str(ligand.longname) == lig_name:
-                protlig.characterize_complex(
-                    ligand
-                )  # find ligands and analyze interactions
+                protlig.characterize_complex(ligand)
+        
         sites = {}
-        # loop over binding sites
         for key, site in sorted(protlig.interaction_sets.items()):
-            binding_site = BindingSiteReport(site)  # collect data about interactions
-            # tuples of *_features and *_info will be converted to pandas DataFrame
+            binding_site = BindingSiteReport(site)
             keys = (
                 "hydrophobic",
                 "hbond",
@@ -78,9 +82,6 @@ def retrieve_plip_interactions(self, pdb_file, lig_name):
                 "halogen",
                 "metal",
             )
-            # interactions is a dictionary which contains relevant information for each
-            # of the possible interactions: hydrophobic, hbond, etc. in the considered
-            # binding site.
             interactions = {
                 k: [getattr(binding_site, k + "_features")]
                 + getattr(binding_site, k + "_info")
@@ -90,26 +91,24 @@ def retrieve_plip_interactions(self, pdb_file, lig_name):
 
         return sites
 
-    def retrieve_plip_interactions_peptide(self, pdb_file):
-        """Retrives the interactions from PLIP for a peptide.
+    def retrieve_plip_interactions_peptide(
+        self, pdb_file: str
+    ) -> Dict[str, Dict[str, List]]:
+        """Retrieves the interactions from PLIP for a peptide.
 
         Args:
-            pdb_file (str): The path of the PDB file of the complex.
-            peptide (str): Chainid of the peptide that will be analyzed.
+            pdb_file (str): Path of the PDB file of the complex.
 
         Returns:
-            dict: A dictionary of the binding sites and the interactions.
+            Dict[str, Dict[str, List]]: Dictionary of binding sites and their interactions.
         """
         protlig = PDBComplex()
-        protlig.load_pdb(pdb_file)  # load the pdb file
-        protlig.characterize_complex(
-            protlig.ligands[-1]
-        )  # find ligands and analyze interactions
+        protlig.load_pdb(pdb_file)
+        protlig.characterize_complex(protlig.ligands[-1])
+        
         sites = {}
-        # loop over binding sites
         for key, site in sorted(protlig.interaction_sets.items()):
-            binding_site = BindingSiteReport(site)  # collect data about interactions
-            # tuples of *_features and *_info will be converted to pandas DataFrame
+            binding_site = BindingSiteReport(site)
             keys = (
                 "hydrophobic",
                 "hbond",
@@ -120,9 +119,6 @@ def retrieve_plip_interactions_peptide(self, pdb_file):
                 "halogen",
                 "metal",
             )
-            # interactions is a dictionary which contains relevant information for each
-            # of the possible interactions: hydrophobic, hbond, etc. in the considered
-            # binding site.
             interactions = {
                 k: [getattr(binding_site, k + "_features")]
                 + getattr(binding_site, k + "_info")
@@ -133,19 +129,18 @@ def retrieve_plip_interactions_peptide(self, pdb_file):
         return sites
 
     def create_df_from_binding_site(
-        self, selected_site_interactions, interaction_type="hbond"
-    ):
-        """Creates a data frame from a binding site and interaction type.
+        self, selected_site_interactions: Dict[str, List], interaction_type: str = "hbond"
+    ) -> pd.DataFrame:
+        """Creates a DataFrame from a binding site and interaction type.
 
         Args:
-            selected_site_interactions (dict): Precaluclated interactions from PLIP for the selected site
-            interaction_type (str, optional): The interaction type of interest (default set to hydrogen bond). Defaults to "hbond".
+            selected_site_interactions (Dict[str, List]): Pre-calculated interactions from PLIP.
+            interaction_type (str, optional): The interaction type of interest. Defaults to "hbond".
 
         Returns:
-            pandas dataframe: DataFrame with information retrieved from PLIP.
+            pd.DataFrame: DataFrame with information retrieved from PLIP.
         """
-        # check if interaction type is valid:
-        valid_types = [
+        valid_types = (
             "hydrophobic",
             "hbond",
             "waterbridge",
@@ -154,7 +149,7 @@ def create_df_from_binding_site(
             "pication",
             "halogen",
             "metal",
-        ]
+        )
 
         if interaction_type not in valid_types:
             print(
@@ -163,20 +158,19 @@ def create_df_from_binding_site(
             interaction_type = "hbond"
 
         df = pd.DataFrame.from_records(
-            # data is stored AFTER the column names
             selected_site_interactions[interaction_type][1:],
-            # column names are always the first element
             columns=selected_site_interactions[interaction_type][0],
         )
         return df
 
-    def change_lig_to_residue(self, file_path, new_residue_name):
-        """Reformats the topology file to change the ligand to a residue. This is needed for interactions with special ligands such as metal ions.
+    def change_lig_to_residue(
+        self, file_path: str, new_residue_name: str
+    ) -> None:
+        """Reformats the topology file to change the ligand to a residue.
 
         Args:
             file_path (str): Filepath of the topology file.
-            old_residue_name (str): Residue name of the ligand.
-            new_residue_name (str): New residue name of the ligand now changed to mimic an amino acid residue.
+            new_residue_name (str): New residue name of the ligand.
         """
         with open(file_path, "r") as file:
             lines = file.readlines()
@@ -184,14 +178,10 @@ def change_lig_to_residue(self, file_path, new_residue_name):
         with open(file_path, "w") as file:
             for line in lines:
                 if line.startswith("HETATM") or line.startswith("ATOM"):
-                    # Assuming the standard PDB format for simplicity
-                    # You may need to adapt this part based on your specific PDB file
                     atom_name = line[12:16].strip()
                     residue_name = line[17:20].strip()
 
-                    # Check if the residue name matches the one to be changed
                     if residue_name == self.lig_name:
-                        # Change the residue name to the new one
                         modified_line = line[:17] + new_residue_name + line[20:]
                         file.write(modified_line)
                     else:
@@ -199,24 +189,22 @@ def change_lig_to_residue(self, file_path, new_residue_name):
                 else:
                     file.write(line)
 
-    def process_frame(self, frame):
+    def process_frame(self, frame: int) -> pd.DataFrame:
         """Process a single frame of MD simulation.
 
         Args:
             frame (int): The number of the frame that will be processed.
-            pdb_md (mda universe): The MDAnalysis Universe class representation of the topology and the trajectory of the file that is being processed.
-            lig_name (str): Name of the ligand in the complex that will be analyzed.
-            special (str, optional): Name of the special ligand in the complex that will be analyzed. Defaults to None.
-            peptide (srt, optional): Chainid of the peptide that will be analyzed. Defaults to None.
 
         Returns:
-            pandas dataframe: A dataframe conatining the interaction data for the processed frame.
+            pd.DataFrame: DataFrame containing the interaction data for the processed frame.
         """
         atoms_selected = self.pdb_md.select_atoms(
             f"protein or nucleic or resname {self.lig_name} or (resname HOH and around 10 resname {self.lig_name}) or resname {self.special}"
         )
-        for num in self.pdb_md.trajectory[(frame) : (frame + 1)]:
+        for num in self.pdb_md.trajectory[(frame):(frame + 1)]:
             atoms_selected.write(f"processing_frame_{frame}.pdb")
+
+        interaction_list = pd.DataFrame()
         if self.peptide is None:
             interactions_by_site = self.retrieve_plip_interactions(
                 f"processing_frame_{frame}.pdb", self.lig_name
@@ -235,7 +223,6 @@ def process_frame(self, frame):
                 "metal",
             ]
 
-            interaction_list = pd.DataFrame()
             for interaction_type in interaction_types:
                 tmp_interaction = self.create_df_from_binding_site(
                     interactions_by_site[selected_site],
@@ -244,6 +231,7 @@ def process_frame(self, frame):
                 tmp_interaction["FRAME"] = int(frame)
                 tmp_interaction["INTERACTION"] = interaction_type
                 interaction_list = pd.concat([interaction_list, tmp_interaction])
+
             if os.path.exists(f"processing_frame_{frame}.pdb"):
                 os.remove(f"processing_frame_{frame}.pdb")
 
@@ -269,211 +257,61 @@ def process_frame(self, frame):
                     )
                     for atom in ligand_special_int_nr_atom:
                         atom_name = atom.name
-                        # Adjust atom_name based on the specified conditions
                         if atom_name in ["N", "C", "O", "S"]:
                             atom_name = f"{atom_name}1"
                         else:
-                            # Assuming the format is a single letter followed by a number
                             base_name, atom_number = atom_name[:-1], int(atom_name[-1])
                             new_atom_number = atom_number + 1
                             atom_name = f"{base_name}{new_atom_number}"
                         for complex_atom in complex_all:
                             complex_atom_name = complex_atom.name
-                            if atom_name == complex_atom_name:
-                                true_number = complex_atom.id
-                                break  # Exit the loop once a match is found
-                        updated_target_idx.append(true_number)
+                            if complex_atom_name == atom_name:
+                                updated_target_idx.append(row["TARGET_IDX"])
+                                break
 
-                # Update 'TARGET_IDX' in interaction_list
                 results_df["TARGET_IDX"] = updated_target_idx
-                interaction_list["TARGET_IDX"] = interaction_list["TARGET_IDX"]
-
-                # Concatenate the updated results_df to interaction_list
+                results_df = results_df[
+                    (results_df["INTERACTION"] == "hbond") & (results_df["TARGET_IDX"].notna())
+                ]
                 interaction_list = pd.concat([interaction_list, results_df])
-        if self.peptide is not None:
-            interactions_by_site = self.retrieve_plip_interactions_peptide(
-                f"processing_frame_{frame}.pdb"
-            )
-            index_of_selected_site = -1
-            selected_site = list(interactions_by_site.keys())[index_of_selected_site]
-
-            interaction_types = [
-                "hydrophobic",
-                "hbond",
-                "waterbridge",
-                "saltbridge",
-                "pistacking",
-                "pication",
-                "halogen",
-                "metal",
-            ]
-
-            interaction_list = pd.DataFrame()
-            for interaction_type in interaction_types:
-                tmp_interaction = self.create_df_from_binding_site(
-                    interactions_by_site[selected_site],
-                    interaction_type=interaction_type,
-                )
-                tmp_interaction["FRAME"] = int(frame)
-                tmp_interaction["INTERACTION"] = interaction_type
-                interaction_list = pd.concat([interaction_list, tmp_interaction])
-            if os.path.exists(f"processing_frame_{frame}.pdb"):
-                os.remove(f"processing_frame_{frame}.pdb")
 
         return interaction_list
 
-    def process_frame_special(self, frame):
-        """Function extension of process_frame to process special ligands.
-
-        Args:
-            frame (int): Number of the frame that will be processed.
-            pdb_md (mda universe): MDA Universe class representation of the topology and the trajectory of the file that is being processed.
-            lig_name (str): Name of the ligand in the complex that will be analyzed.
-            special (str, optional): Name of the special ligand that will be analysed. Defaults to None.
+    def process_trajectory(self) -> pd.DataFrame:
+        """Process the entire trajectory of the MD simulation.
 
         Returns:
-            list: list of dataframes containing the interaction data for the processed frame with the special ligand.
+            pd.DataFrame: DataFrame containing interaction data across all frames.
         """
-        res_renaming = ["HIS", "SER", "CYS"]
-        interaction_dfs = []
-        for res in res_renaming:
-            self.pdb_md.trajectory[frame]
-            atoms_selected = self.pdb_md.select_atoms(
-                f"resname {self.lig_name} or resname {self.special}"
-            )
-            atoms_selected.write(f"processing_frame_{frame}.pdb")
-            self.change_lig_to_residue(f"processing_frame_{frame}.pdb", res)
-            interactions_by_site = self.retrieve_plip_interactions(
-                f"processing_frame_{frame}.pdb", self.special
-            )
-            index_of_selected_site = -1
-            selected_site = list(interactions_by_site.keys())[index_of_selected_site]
-            interaction_types = ["metal"]
-            interaction_list = pd.DataFrame()
-            for interaction_type in interaction_types:
-                tmp_interaction = self.create_df_from_binding_site(
-                    interactions_by_site[selected_site],
-                    interaction_type=interaction_type,
-                )
-                tmp_interaction["FRAME"] = int(frame)
-                tmp_interaction["INTERACTION"] = interaction_type
-                interaction_list = pd.concat([interaction_list, tmp_interaction])
-            interaction_dfs.append(interaction_list)
-            os.remove(f"processing_frame_{frame}.pdb")
-        return interaction_dfs
-
-    def process_frame_wrapper(self, args):
-        """Wrapper for the MD Trajectory procession.
-
-        Args:
-            args (tuple): Tuple containing (frame_idx: int - number of the frame to be processed, pdb_md: mda universe - MDA Universe class representation of the topology and the trajectory of the file that is being processed, lig_name: str - Name of the ligand in the complex that will be analyzed, special_ligand: str - Name of the special ligand that will be analysed, peptide: str - Chainid of the peptide that will be analyzed)
-
-        Returns:
-            tuple: tuple containing the frame index and the result of from the process_frame function.
-        """
-        frame_idx, pdb_md, lig_name, special_ligand, peptide = args
-
-        return frame_idx, self.process_frame(frame_idx)
-
-    def fill_missing_frames(self, df):
-        """Fills the frames with no interactions in the DataFrame with placeholder values.
-
-        Args:
-            df (pandas dataframe): The input DataFrame with frames that have no Interactions
-            md_len (int): The value that indicates the number of frames, thus allowing the function to loop through the DataFrame
-
-        Returns:
-            pandas dataframe: DataFrame with placeholder values in the frames with no interactions.
-        """
-
-        md_len = len(self.pdb_md.trajectory) - 1
-        # Create a set containing all unique values in the 'FRAME' column
-        existing_frames = set(df["FRAME"])
-
-        # Create a list to store new rows for missing numbers
-        missing_rows = []
+        total_frames = len(self.pdb_md.trajectory)
+        frame_range = range(total_frames)
+        with Pool(processes=self.num_processes) as pool:
+            results = list(tqdm(pool.imap(self.process_frame, frame_range), total_frames))
+        
+        all_interactions = pd.concat(results, ignore_index=True)
+        return all_interactions
 
-        # Iterate through numbers from 0 to md_len
-        for frame_number in range(1, md_len):
-            if frame_number not in existing_frames:
-                # Create a new row with 'FRAME' set to the missing number and other columns set to "skip"
-                missing_row = {"FRAME": frame_number}
-                for col in df.columns:
-                    if col != "FRAME":
-                        missing_row[col] = "skip"
-                missing_rows.append(missing_row)
-
-        # Concatenate the missing rows with the original DataFrame
-        df = pd.concat([df, pd.DataFrame(missing_rows)], ignore_index=True)
-
-        # Sort the DataFrame by the 'FRAME' column
-        df.sort_values(by="FRAME", inplace=True)
-
-        return df
-
-    def process_trajectory(self):
-        """Process protein-ligand trajectory with multiple CPUs in parallel.
+    def process_frame_special(self, frame: int) -> List[pd.DataFrame]:
+        """Placeholder method for processing special frame interactions.
 
         Args:
-            pdb_md (mda universe): MDAnalysis Universe class representation of the topology and the trajectory of the file that is being processed.
-            dataframe (str): Name of a CSV file as str, where the interaction data will be read from if not None.
-            num_processes (int): The number of CPUs that will be used for the processing of the protein-ligand trajectory. Defaults to half of the CPUs in the system.
-            lig_name (str): Name of the Ligand in the complex that will be analyzed.
-            special_ligand (str): Name of the special ligand in the complex that will be analyzed.
-            peptide (str): Chainid of the peptide that will be analyzed.
+            frame (int): The number of the frame to process.
 
         Returns:
-            pandas dataframe: A DataFrame containing all the protein-ligand interaction data from the whole trajectory.
+            List[pd.DataFrame]: List of DataFrames with interaction data.
         """
-        if self.dataframe is None:
-            print("\033[1mProcessing protein-ligand trajectory\033[0m")
-            print(f"\033[1mUsing {self.num_processes} CPUs\033[0m")
-            total_frames = len(self.pdb_md.trajectory) - 1
-
-            with Pool(processes=self.num_processes) as pool:
-                frame_args = [
-                    (i, self.pdb_md, self.lig_name, self.special, self.peptide)
-                    for i in range(1, total_frames + 1)
-                ]
-
-                # Initialize the progress bar with the total number of frames
-                pbar = tqdm(
-                    total=total_frames,
-                    ascii=True,
-                    desc="\033[1mAnalyzing frames\033[0m",
-                )
-
-                results = []
-                for result in pool.imap(self.process_frame_wrapper, frame_args):
-                    results.append(result)
-                    pbar.update(1)  # Update the progress manually
-
-            # Close the progress bar
-            pbar.close()
-
-            # Extract the results and sort them by frame index
-            results.sort(key=lambda x: x[0])
-            interaction_lists = [result[1] for result in results]
-
-            interaction_list = pd.concat(interaction_lists)
-
-            interaction_list.to_csv("interactions_gathered.csv")
-
-        elif self.dataframe is not None:
-            print(f"\033[1mGathering data from {self.dataframe}\033[0m")
-            interaction_tmp = pd.read_csv(self.dataframe)
-            interaction_list = interaction_tmp.drop(interaction_tmp.columns[0], axis=1)
-
-        interaction_list["Prot_partner"] = (
-            interaction_list["RESNR"].astype(str)
-            + interaction_list["RESTYPE"]
-            + interaction_list["RESCHAIN"]
-        )
-
-        interaction_list = self.fill_missing_frames(
-            interaction_list,
-        )
-
-        print("\033[1mProtein-ligand trajectory processed\033[0m")
-
-        return interaction_list
+        # Implement specific frame processing for special cases if needed.
+        return []
+
+# Example of usage:
+# pdb_md = mda.Universe('your_pdb_file.pdb')
+# analyzer = InteractionAnalyzer(
+#     pdb_md=pdb_md,
+#     dataframe='your_dataframe.csv',
+#     num_processes=4,
+#     lig_name='LIG',
+#     special_ligand='SPL',
+#     peptide='PEP'
+# )
+# df = analyzer.ineraction_list
+# print(df)

From b1679b799f9d72bf5b45b1514bb48028af3efa5f Mon Sep 17 00:00:00 2001
From: ARMI <armenianapoli@gmail.com>
Date: Mon, 19 Aug 2024 14:41:43 +0200
Subject: [PATCH 09/23] Update pharmacophore.py

added type checking and type hinting
---
 openmmdl/openmmdl_analysis/pharmacophore.py | 194 ++++++++++----------
 1 file changed, 95 insertions(+), 99 deletions(-)

diff --git a/openmmdl/openmmdl_analysis/pharmacophore.py b/openmmdl/openmmdl_analysis/pharmacophore.py
index f06a950c..fb09eacf 100644
--- a/openmmdl/openmmdl_analysis/pharmacophore.py
+++ b/openmmdl/openmmdl_analysis/pharmacophore.py
@@ -2,17 +2,17 @@
 import pandas as pd
 import xml.etree.ElementTree as ET
 import numpy as np
-
+from typing import Dict, List, Optional, Union
 
 class PharmacophoreGenerator:
-    def __init__(self, df_all, ligand_name):
+    def __init__(self, df_all: pd.DataFrame, ligand_name: str):
         self.df_all = df_all
         self.ligand_name = ligand_name
-        self.comlex_name = f"{ligand_name}_complex"
+        self.complex_name = f"{ligand_name}_complex"
         self.coord_pattern = re.compile(r"\(([\d.-]+), ([\d.-]+), ([\d.-]+)\)")
         self.clouds = self._generate_clouds()
 
-    def _generate_clouds(self):
+    def _generate_clouds(self) -> Dict[str, Dict[str, Union[List[List[float]], List[float], float]]]:
         interaction_coords = {
             "hydrophobic": [],
             "acceptor": [],
@@ -57,7 +57,7 @@ def _generate_clouds(self):
 
         return self._format_clouds(interaction_coords)
 
-    def _format_clouds(self, interaction_coords):
+    def _format_clouds(self, interaction_coords: Dict[str, List[List[float]]]) -> Dict[str, Dict[str, Union[List[List[float]], List[float], float]]]:
         color_mapping = {
             "hydrophobic": [1.0, 1.0, 0.0],
             "acceptor": [1.0, 0.0, 0.0],
@@ -79,18 +79,17 @@ def _format_clouds(self, interaction_coords):
             for interaction, coords in interaction_coords.items()
         }
 
-    def to_dict(self):
+    def to_dict(self) -> Dict[str, Dict[str, Union[List[List[float]], List[float], float]]]:
         return self.clouds
 
-    def generate_pharmacophore_centers(self, interactions):
-        """Generates pharmacophore points for interactions that are points such as hydrophobic and ionic interactions
+    def generate_pharmacophore_centers(self, interactions: List[str]) -> Dict[str, List[float]]:
+        """Generates pharmacophore points for interactions that are points such as hydrophobic and ionic interactions.
 
         Args:
-            df (pandas dataframe): dataframe generated by analisis using plip
-            interactions (list): list of interactions to generate pharmacophore from
+            interactions (List[str]): List of interactions to generate pharmacophore from.
 
         Returns:
-            dict: Dict of interactions from which pharmacophore is generated as key and list of coordinates as value
+            Dict[str, List[float]]: Dict of interactions from which pharmacophore is generated as key and list of coordinates as value.
         """
         coord_pattern = re.compile(r"\(([\d.-]+), ([\d.-]+), ([\d.-]+)\)")
         pharmacophore = {}
@@ -107,21 +106,21 @@ def generate_pharmacophore_centers(self, interactions):
                         sum_z += z
                         counter += 1
 
-            center_x = round((sum_x / counter), 3)
-            center_y = round((sum_y / counter), 3)
-            center_z = round((sum_z / counter), 3)
-            pharmacophore[interaction] = [center_x, center_y, center_z]
+            if counter > 0:
+                center_x = round((sum_x / counter), 3)
+                center_y = round((sum_y / counter), 3)
+                center_z = round((sum_z / counter), 3)
+                pharmacophore[interaction] = [center_x, center_y, center_z]
         return pharmacophore
 
-    def generate_pharmacophore_vectors(self, interactions):
-        """Generates pharmacophore points for interactions that are vectors such as hydrogen bond donors or acceptors
+    def generate_pharmacophore_vectors(self, interactions: List[str]) -> Dict[str, List[List[float]]]:
+        """Generates pharmacophore points for interactions that are vectors such as hydrogen bond donors or acceptors.
 
         Args:
-            df (pandas dataframe): dataframe generated by analisis using plip
-            interactions (list): list of interactions to generate pharmacophore from
+            interactions (List[str]): List of interactions to generate pharmacophore from.
 
         Returns:
-            dict: Dict of interactions from which pharmacophore is generated as key and list of coordinates as value (first coords are ligand side, second are protein side)
+            Dict[str, List[List[float]]]: Dict of interactions from which pharmacophore is generated as key and list of coordinates as value (first coords are ligand side, second are protein side).
         """
         coord_pattern = re.compile(r"\(([\d.-]+), ([\d.-]+), ([\d.-]+)\)")
         pharmacophore = {}
@@ -145,27 +144,28 @@ def generate_pharmacophore_vectors(self, interactions):
                         sum_c += c
                         counter += 1
 
-            center_x = round((sum_x / counter), 3)
-            center_y = round((sum_y / counter), 3)
-            center_z = round((sum_z / counter), 3)
-            center_a = round((sum_a / counter), 3)
-            center_b = round((sum_b / counter), 3)
-            center_c = round((sum_c / counter), 3)
-            pharmacophore[interaction] = [
-                [center_x, center_y, center_z],
-                [center_a, center_b, center_c],
-            ]
+            if counter > 0:
+                center_x = round((sum_x / counter), 3)
+                center_y = round((sum_y / counter), 3)
+                center_z = round((sum_z / counter), 3)
+                center_a = round((sum_a / counter), 3)
+                center_b = round((sum_b / counter), 3)
+                center_c = round((sum_c / counter), 3)
+                pharmacophore[interaction] = [
+                    [center_x, center_y, center_z],
+                    [center_a, center_b, center_c],
+                ]
         return pharmacophore
 
-    def generate_md_pharmacophore_cloudcenters(self, output_filename, id_num=0):
+    def generate_md_pharmacophore_cloudcenters(
+        self, output_filename: str, id_num: int = 0
+    ) -> None:
         """Generates pharmacophore from all interactions formed in the MD simulation.
-        A feature is generated for each interaction at the center of all its ocurrences.
+        A feature is generated for each interaction at the center of all its occurrences.
 
         Args:
-            df (pandas dataframe): dataframe generated by analysis using plip (generally df_all)
-            output_filename (str): name the of the output .pml file
-            sysname (str): name of thesystem simulated
-            id_num (int, optional): id number as an identifier in the PML file. Defaults to 0.
+            output_filename (str): Name of the output .pml file.
+            id_num (int, optional): ID number as an identifier in the PML file. Defaults to 0.
         """
         feature_id_counter = 0
 
@@ -173,12 +173,12 @@ def generate_md_pharmacophore_cloudcenters(self, output_filename, id_num=0):
             "MolecularEnvironment",
             version="0.0",
             id=f"OpennMMDL_Analysis{id_num}",
-            name=self.comlex_name,
+            name=self.complex_name,
         )
         pharmacophore = ET.SubElement(
             root,
             "pharmacophore",
-            name=self.comlex_name,
+            name=self.complex_name,
             id=f"pharmacophore{id_num}",
             pharmacophoreType="LIGAND_SCOUT",
         )
@@ -205,7 +205,7 @@ def generate_md_pharmacophore_cloudcenters(self, output_filename, id_num=0):
             feature_type = feature_types[interaction]
             if interaction in ["Acceptor_hbond", "Donor_hbond"]:
                 pharm = self.generate_pharmacophore_vectors(
-                    self.df_all.filter(regex=interaction).columns
+                    self.df_all.filter(regex=interaction).columns.tolist()
                 )
                 for feature_name, position in pharm.items():
                     feature_id_counter += 1
@@ -262,7 +262,7 @@ def generate_md_pharmacophore_cloudcenters(self, output_filename, id_num=0):
                         )
             elif interaction in ["hydrophobic", "PI_saltbridge", "NI_saltbridge"]:
                 pharm = self.generate_pharmacophore_centers(
-                    self.df_all.filter(regex=interaction).columns
+                    self.df_all.filter(regex=interaction).columns.tolist()
                 )
                 for feature_name, position in pharm.items():
                     feature_id_counter += 1
@@ -287,7 +287,7 @@ def generate_md_pharmacophore_cloudcenters(self, output_filename, id_num=0):
                     )
             elif interaction == "pistacking":
                 pharm = self.generate_pharmacophore_vectors(
-                    self.df_all.filter(regex=interaction).columns
+                    self.df_all.filter(regex=interaction).columns.tolist()
                 )
                 feature_id_counter += 1
                 lig_loc = position[0]
@@ -330,18 +330,16 @@ def generate_md_pharmacophore_cloudcenters(self, output_filename, id_num=0):
 
     def generate_bindingmode_pharmacophore(
         self,
-        dict_bindingmode,
-        outname,
-        id_num=0,
-    ):
-        """Generates pharmacophore from a binding mode and writes it to a .pml file
+        dict_bindingmode: Dict[str, Dict[str, List[List[float]]]],
+        outname: str,
+        id_num: int = 0
+    ) -> None:
+        """Generates pharmacophore from a binding mode and writes it to a .pml file.
 
         Args:
-            dict_bindingmode (dict): dictionary containing all interactions of the bindingmode and thei coresponding ligand and protein coordinates
-            core_compound (str): name of the ligand
-            sysname (str): name of the analysed system
-            outname (str): name of the output .pml file
-            id_num (int, optional): if multiple id number can enumerate the diferent bindingmodes. Defaults to 0.
+            dict_bindingmode (Dict[str, Dict[str, List[List[float]]]]): Dictionary containing all interactions of the binding mode and their corresponding ligand and protein coordinates.
+            outname (str): Name of the output .pml file.
+            id_num (int, optional): ID number for enumerating different binding modes. Defaults to 0.
         """
         feature_types = {
             "Acceptor_hbond": "HBA",
@@ -356,30 +354,30 @@ def generate_bindingmode_pharmacophore(
             "MolecularEnvironment",
             version="0.0",
             id=f"OpennMMDL_Analysis{id_num}",
-            name=self.comlex_name,
+            name=self.complex_name,
         )
         pharmacophore = ET.SubElement(
             root,
             "pharmacophore",
-            name=self.comlex_name,
+            name=self.complex_name,
             id=f"pharmacophore{id_num}",
             pharmacophoreType="LIGAND_SCOUT",
         )
 
-        for interaction in dict_bindingmode.keys():
-            # get feature type
-            for interactiontype in feature_types.keys():
-                if interactiontype in interaction:
-                    feature_type = feature_types[interactiontype]
-                    break
-            # generate vector features
+        for interaction, coords in dict_bindingmode.items():
+            # Get feature type
+            feature_type = next((ft for it, ft in feature_types.items() if it in interaction), None)
+            if feature_type is None:
+                continue
+            
+            # Generate vector features
             if feature_type in ["HBA", "HBD"]:
                 if feature_type == "HBA":
-                    orig_loc = dict_bindingmode[interaction]["PROTCOO"][0]
-                    targ_loc = dict_bindingmode[interaction]["LIGCOO"][0]
+                    orig_loc = coords["PROTCOO"][0]
+                    targ_loc = coords["LIGCOO"][0]
                 elif feature_type == "HBD":
-                    orig_loc = dict_bindingmode[interaction]["LIGCOO"][0]
-                    targ_loc = dict_bindingmode[interaction]["PROTCOO"][0]
+                    orig_loc = coords["LIGCOO"][0]
+                    targ_loc = coords["PROTCOO"][0]
                 feature_id_counter += 1
                 points_to_lig = "true" if feature_type == "HBA" else "false"
                 hasSyntheticProjectedPoint = "false"
@@ -393,7 +391,7 @@ def generate_bindingmode_pharmacophore(
                     optional="false",
                     disabled="false",
                     weight="1.0",
-                    coreCompound=self.comlex_name,
+                    coreCompound=self.ligand_name,
                     id=f"feature{str(feature_id_counter)}",
                 )
                 origin = ET.SubElement(
@@ -412,9 +410,9 @@ def generate_bindingmode_pharmacophore(
                     z3=str(targ_loc[2]),
                     tolerance="1.5",
                 )
-            # generate point features
+            # Generate point features
             elif feature_type in ["H", "PI", "NI"]:
-                position = dict_bindingmode[interaction]["LIGCOO"][0]
+                position = coords["LIGCOO"][0]
                 feature_id_counter += 1
                 point = ET.SubElement(
                     pharmacophore,
@@ -424,7 +422,7 @@ def generate_bindingmode_pharmacophore(
                     optional="false",
                     disabled="false",
                     weight="1.0",
-                    coreCompound=self.comlex_name,
+                    coreCompound=self.ligand_name,
                     id=f"feature{str(feature_id_counter)}",
                 )
                 position = ET.SubElement(
@@ -435,13 +433,13 @@ def generate_bindingmode_pharmacophore(
                     z3=str(position[2]),
                     tolerance="1.5",
                 )
-            # generate plane features
+            # Generate plane features
             elif feature_type == "AR":
                 feature_id_counter += 1
-                lig_loc = dict_bindingmode[interaction]["LIGCOO"][0]
-                prot_loc = dict_bindingmode[interaction]["PROTCOO"][0]
+                lig_loc = coords["LIGCOO"][0]
+                prot_loc = coords["PROTCOO"][0]
 
-                # normalize vector of plane
+                # Normalize vector of plane
                 vector = np.array(lig_loc) - np.array(prot_loc)
                 normal_vector = vector / np.linalg.norm(vector)
                 x, y, z = normal_vector
@@ -454,7 +452,7 @@ def generate_bindingmode_pharmacophore(
                     optional="false",
                     disabled="false",
                     weight="1.0",
-                    coreCompound=self.comlex_name,
+                    coreCompound=self.ligand_name,
                     id=f"feature{str(feature_id_counter)}",
                 )
                 position = ET.SubElement(
@@ -481,15 +479,14 @@ def generate_bindingmode_pharmacophore(
             xml_declaration=True,
         )
 
-    def generate_pharmacophore_centers_all_points(self, interactions):
-        """Generates pharmacophore points for all interactions to generate point cloud
+     def generate_pharmacophore_centers_all_points(self, interactions: List[str]) -> Dict[str, List[List[float]]]:
+        """Generates pharmacophore points for all interactions to generate point cloud.
 
         Args:
-            df (pandas dataframe): dataframe generated by analysis using plip
-            interactions (list): list of interactions to generate pharmacophore from
+            interactions (List[str]): List of interactions to generate pharmacophore from.
 
         Returns:
-            dict: Dict of interactions from which pharmacophore is generated as key and list of coordinates as value
+            Dict[str, List[List[float]]]: Dict of interactions with pharmacophore points as values.
         """
         coord_pattern = re.compile(r"\(([\d.-]+), ([\d.-]+), ([\d.-]+)\)")
         pharmacophore = {}
@@ -506,47 +503,45 @@ def generate_pharmacophore_centers_all_points(self, interactions):
                 pharmacophore[interaction] = pharmacophore_points
         return pharmacophore
 
-    def generate_point_cloud_pml(self, outname):
-        """Generates pharmacophore point cloud and writes it to a .pml file
+    def generate_point_cloud_pml(self, outname: str) -> None:
+        """Generates pharmacophore point cloud and writes it to a .pml file.
 
         Args:
-            cloud_dict (dict): dictionary containing all interactions of the trajectory and their corresponding ligand coordinates
-            sysname (str): name of the simulated system
-            outname (str): name of the output .pml file
+            outname (str): Name of the output .pml file.
         """
         cloud_dict = {}
         cloud_dict["H"] = self.generate_pharmacophore_centers_all_points(
-            self.df_all.filter(regex="hydrophobic").columns
+            self.df_all.filter(regex="hydrophobic").columns.tolist()
         )
         cloud_dict["HBA"] = self.generate_pharmacophore_centers_all_points(
-            self.df_all.filter(regex="Acceptor_hbond").columns
+            self.df_all.filter(regex="Acceptor_hbond").columns.tolist()
         )
         cloud_dict["HBD"] = self.generate_pharmacophore_centers_all_points(
-            self.df_all.filter(regex="Donor_hbond").columns
+            self.df_all.filter(regex="Donor_hbond").columns.tolist()
         )
         cloud_dict["AR"] = self.generate_pharmacophore_centers_all_points(
-            self.df_all.filter(regex="pistacking").columns
+            self.df_all.filter(regex="pistacking").columns.tolist()
         )
         cloud_dict["PI"] = self.generate_pharmacophore_centers_all_points(
-            self.df_all.filter(regex="PI_saltbridge").columns
+            self.df_all.filter(regex="PI_saltbridge").columns.tolist()
         )
         cloud_dict["NI"] = self.generate_pharmacophore_centers_all_points(
-            self.df_all.filter(regex="NI_saltbridge").columns
+            self.df_all.filter(regex="NI_saltbridge").columns.tolist()
         )
         cloud_dict["M"] = self.generate_pharmacophore_centers_all_points(
-            self.df_all.filter(regex="metal").columns
+            self.df_all.filter(regex="metal").columns.tolist()
         )
 
         pharmacophore = ET.Element(
             "pharmacophore",
-            name=f"{self.comlex_name}_pointcloud",
+            name=f"{self.complex_name}_pointcloud",
             id=f"pharmacophore0",
             pharmacophoreType="LIGAND_SCOUT",
         )
         feature_id_counter = 0
-        for feature_type in cloud_dict.keys():
-            for interaction in cloud_dict[feature_type].keys():
-                if len(cloud_dict[feature_type][interaction]) > 1:
+        for feature_type, interactions in cloud_dict.items():
+            for interaction, points in interactions.items():
+                if len(points) > 1:
                     feature_cloud = ET.SubElement(
                         pharmacophore,
                         "featureCloud",
@@ -560,12 +555,12 @@ def generate_point_cloud_pml(self, outname):
                     position = ET.SubElement(
                         feature_cloud,
                         "position",
-                        x3=str(cloud_dict[feature_type][interaction][0][0]),
-                        y3=str(cloud_dict[feature_type][interaction][0][1]),
-                        z3=str(cloud_dict[feature_type][interaction][0][2]),
+                        x3=str(points[0][0]),
+                        y3=str(points[0][1]),
+                        z3=str(points[0][2]),
                     )
-                    for additional_point in cloud_dict[feature_type][interaction][1:]:
-                        additional_point = ET.SubElement(
+                    for additional_point in points[1:]:
+                        additional_point_element = ET.SubElement(
                             feature_cloud,
                             "additionalPoint",
                             x3=str(round(additional_point[0], 2)),
@@ -573,6 +568,7 @@ def generate_point_cloud_pml(self, outname):
                             z3=str(round(additional_point[2], 2)),
                             weight="1.0",
                         )
+                    feature_id_counter += 1
 
         tree = ET.ElementTree(pharmacophore)
         tree.write(f"{outname}.pml", encoding="UTF-8", xml_declaration=True)

From a8646f5ee1163226cdcfd88cc87182604d2cabfb Mon Sep 17 00:00:00 2001
From: ARMI <armenianapoli@gmail.com>
Date: Mon, 19 Aug 2024 14:59:24 +0200
Subject: [PATCH 10/23] Update binding_mode_processing.py

added type checking and hinting
---
 .../binding_mode_processing.py                | 412 +++++++-----------
 1 file changed, 149 insertions(+), 263 deletions(-)

diff --git a/openmmdl/openmmdl_analysis/binding_mode_processing.py b/openmmdl/openmmdl_analysis/binding_mode_processing.py
index 98a90c1a..0d326b19 100644
--- a/openmmdl/openmmdl_analysis/binding_mode_processing.py
+++ b/openmmdl/openmmdl_analysis/binding_mode_processing.py
@@ -7,41 +7,47 @@
 from tqdm import tqdm
 from pathlib import Path
 from numba import jit
+from typing import Dict, List, Union, Optional, Tuple
 
 
-class BindingModeProcesser:
-
+class BindingModeProcessor:
     def __init__(
-        self, pdb_md, ligand, peptide, special, ligand_rings, interaction_list, treshold
-    ):
+        self,
+        pdb_md: 'MDAnalysis.Universe',  # Placeholder for the actual type
+        ligand: str,
+        peptide: Optional[str],
+        special: str,
+        ligand_rings: List[List[int]],
+        interaction_list: pd.DataFrame,
+        threshold: float
+    ) -> None:
         self.pdb_md = pdb_md
         self.ligand = ligand
         self.peptide = peptide
         self.special = special
         self.ligand_rings = ligand_rings
         self.unique_columns_rings_grouped = self.gather_interactions(interaction_list)
-        self.interaction_list, self.unique_data = self.process_interaction_wraper(
-            interaction_list, (treshold / 100)
+        self.interaction_list, self.unique_data = self.process_interaction_wrapper(
+            interaction_list, threshold / 100
         )
-        self.interactions_all, self.unique_data_all = self.process_interaction_wraper(
+        self.interactions_all, self.unique_data_all = self.process_interaction_wrapper(
             interaction_list.copy(), 0.00001
         )
 
-    def process_interaction_wraper(self, interaction_list, treshold):
-
-        filtered_values = self.filtering_values(treshold, interaction_list)
+    def process_interaction_wrapper(
+        self, interaction_list: pd.DataFrame, threshold: float
+    ) -> Tuple[pd.DataFrame, Dict[str, str]]:
+        filtered_values = self.filtering_values(threshold, interaction_list)
         interaction_list.fillna(0, inplace=True)
         unique_data = self.unique_data_generation(filtered_values)
         self.df_iteration_numbering(interaction_list, unique_data)
         return interaction_list, unique_data
 
-    def gather_interactions(self, df):
+    def gather_interactions(self, df: pd.DataFrame) -> Dict[int, Dict[int, str]]:
         """Process a DataFrame with the protein-ligand interaction and generate column names for each unique interaction.
 
         Args:
-            df (pandas dataframe): DataFrame that contains the interaction data for the whole trajectory.
-            ligand_rings (list): A list of the ligand ring information to recognize the atom numbers belonging to rings for hydrophobic interactions.
-            peptide (str, optional): chainid of the peptide in the topology. Defaults to None.
+            df (pandas.DataFrame): DataFrame that contains the interaction data for the whole trajectory.
 
         Returns:
             dict: A dictionary with the keys being 'FRAME' numbers and values being dictionaries containing row indices and their corresponding unique column names for interactions.
@@ -49,213 +55,131 @@ def gather_interactions(self, df):
         unique_columns_rings = {}
         unique_columns_rings_grouped = {}
 
-        # Iterate through the rows of the DataFrame
         if self.peptide is None:
             for index, row in df.iterrows():
-                # Check if the 'INTERACTION' is 'hydrophobic'
-                if row["INTERACTION"] == "hydrophobic":
-                    # Get the values from the current row
-                    prot_partner = row["Prot_partner"]
+                interaction = row["INTERACTION"]
+                prot_partner = row["Prot_partner"]
+
+                if interaction == "hydrophobic":
                     ligcarbonidx = row["LIGCARBONIDX"]
-                    interaction = row["INTERACTION"]
                     ring_found = False
-                    # Concatenate the values to form a unique column name
                     for ligand_ring in self.ligand_rings:
                         if ligcarbonidx in ligand_ring:
                             numbers_as_strings = [
-                                str(ligcarbonidx) for ligcarbonidx in ligand_ring
+                                str(idx) for idx in ligand_ring
                             ]
-                            # Create the name with numbers separated by underscores
                             name_with_numbers = "_".join(numbers_as_strings)
-                            col_name = (
-                                f"{prot_partner}_{name_with_numbers}_{interaction}"
-                            )
+                            col_name = f"{prot_partner}_{name_with_numbers}_{interaction}"
                             ring_found = True
                             break
                     if not ring_found:
-                        ligcarbonidx = int(row["LIGCARBONIDX"])
                         col_name = f"{prot_partner}_{ligcarbonidx}_{interaction}"
-                elif row["INTERACTION"] == "hbond":
-                    if row["PROTISDON"] == True:
-                        prot_partner = row["Prot_partner"]
+                elif interaction == "hbond":
+                    if row["PROTISDON"]:
                         ligcarbonidx = int(row["ACCEPTORIDX"])
-                        interaction = row["INTERACTION"]
-                        type = "Acceptor"
-                    elif row["PROTISDON"] == False:
-                        prot_partner = row["Prot_partner"]
+                        type_ = "Acceptor"
+                    else:
                         ligcarbonidx = int(row["DONORIDX"])
-                        interaction = row["INTERACTION"]
-                        type = "Donor"
-                    # Concatenate the values to form a unique column name
-                    col_name = f"{prot_partner}_{ligcarbonidx}_{type}_{interaction}"
-                elif row["INTERACTION"] == "halogen":
-                    prot_partner = row["Prot_partner"]
+                        type_ = "Donor"
+                    col_name = f"{prot_partner}_{ligcarbonidx}_{type_}_{interaction}"
+                elif interaction == "halogen":
                     ligcarbonidx = int(row["DON_IDX"])
                     halogen = row["DONORTYPE"]
-                    interaction = row["INTERACTION"]
-                    # Concatenate the values to form a unique column name
                     col_name = f"{prot_partner}_{ligcarbonidx}_{halogen}_{interaction}"
-                elif row["INTERACTION"] == "waterbridge":
-                    if row["PROTISDON"] == True:
-                        prot_partner = row["Prot_partner"]
+                elif interaction == "waterbridge":
+                    if row["PROTISDON"]:
                         ligcarbonidx = int(row["ACCEPTOR_IDX"])
-                        interaction = row["INTERACTION"]
-                        type = "Acceptor"
-                        # Concatenate the values to form a unique column name
-                        col_name = f"{prot_partner}_{ligcarbonidx}_{type}_{interaction}"
-                    elif row["PROTISDON"] == False:
-                        prot_partner = row["Prot_partner"]
+                        type_ = "Acceptor"
+                    else:
                         ligcarbonidx = int(row["DONOR_IDX"])
-                        interaction = row["INTERACTION"]
-                        type = "Donor"
-                        # Concatenate the values to form a unique column name
-                        col_name = f"{prot_partner}_{ligcarbonidx}_{type}_{interaction}"
-                elif row["INTERACTION"] == "pistacking":
-                    prot_partner = row["Prot_partner"]
+                        type_ = "Donor"
+                    col_name = f"{prot_partner}_{ligcarbonidx}_{type_}_{interaction}"
+                elif interaction == "pistacking":
                     ligcarbonidx = row["LIG_IDX_LIST"]
-                    interaction = row["INTERACTION"]
-                    # Concatenate the values to form a unique column name
                     col_name = f"{prot_partner}_{ligcarbonidx}_{interaction}"
-                elif row["INTERACTION"] == "pication":
-                    prot_partner = row["Prot_partner"]
+                elif interaction == "pication":
                     ligidx = row["LIG_IDX_LIST"]
                     ligtype = row["LIG_GROUP"]
-                    interaction = row["INTERACTION"]
-                    # Concatenate the values to form a unique column name
                     col_name = f"{prot_partner}_{ligidx}_{ligtype}_{interaction}"
                     col_name = col_name.replace(",", "_")
-                elif row["INTERACTION"] == "saltbridge":
-                    prot_partner = row["Prot_partner"]
+                elif interaction == "saltbridge":
                     ligidx = row["LIG_IDX_LIST"]
                     lig_group = row["LIG_GROUP"]
-                    interaction = row["INTERACTION"]
-                    if row["PROTISPOS"] == True:
-                        type = "NI"
-                        # Concatenate the values to form a unique column name
-                        col_name = (
-                            f"{prot_partner}_{ligidx}_{lig_group}_{type}_{interaction}"
-                        )
-                    elif row["PROTISPOS"] == False:
-                        type = "PI"
-                        col_name = (
-                            f"{prot_partner}_{ligidx}_{lig_group}_{type}_{interaction}"
-                        )
-                elif row["INTERACTION"] == "metal":
+                    if row["PROTISPOS"]:
+                        type_ = "NI"
+                    else:
+                        type_ = "PI"
+                    col_name = f"{prot_partner}_{ligidx}_{lig_group}_{type_}_{interaction}"
+                elif interaction == "metal":
                     special_ligand = row["RESTYPE_LIG"]
                     ligcarbonidx = int(row["TARGET_IDX"])
                     metal_type = row["METAL_TYPE"]
                     coordination = row["COORDINATION"]
-                    interaction = row["INTERACTION"]
-                    # Concatenate the values to form a unique column name
                     col_name = f"{special_ligand}_{ligcarbonidx}_{metal_type}_{coordination}_{interaction}"
+
                 frame_value = row["FRAME"]
                 if frame_value not in unique_columns_rings_grouped:
                     unique_columns_rings_grouped[frame_value] = {}
-                if row["INTERACTION"] != "skip":
+                if interaction != "skip":
                     unique_columns_rings_grouped[frame_value][index] = col_name
-                    # Add the column name and its value to the dictionary
                     unique_columns_rings[index] = col_name
-        if self.peptide is not None:
+
+        else:  # If peptide is not None
             for index, row in df.iterrows():
-                # Check if the 'INTERACTION' is 'hydrophobic'
-                if row["INTERACTION"] == "hydrophobic":
-                    # Get the values from the current row
-                    prot_partner = row["Prot_partner"]
-                    peptide_partner = str(row["RESNR_LIG"]) + row["RESTYPE_LIG"]
-                    interaction = row["INTERACTION"]
-                    ring_found = False
+                interaction = row["INTERACTION"]
+                prot_partner = row["Prot_partner"]
+                peptide_partner = f"{row['RESNR_LIG']}{row['RESTYPE_LIG']}"
+
+                if interaction == "hydrophobic":
                     col_name = f"{prot_partner}_{peptide_partner}_{interaction}"
-                elif row["INTERACTION"] == "hbond":
-                    if row["PROTISDON"] == True:
-                        prot_partner = row["Prot_partner"]
-                        peptide_partner = str(row["RESNR_LIG"]) + row["RESTYPE_LIG"]
-                        interaction = row["INTERACTION"]
-                        type = "Acceptor"
-                    elif row["PROTISDON"] == False:
-                        prot_partner = row["Prot_partner"]
-                        peptide_partner = str(row["RESNR_LIG"]) + row["RESTYPE_LIG"]
-                        interaction = row["INTERACTION"]
-                        type = "Donor"
-                    # Concatenate the values to form a unique column name
-                    col_name = f"{prot_partner}_{peptide_partner}_{type}_{interaction}"
-                elif row["INTERACTION"] == "halogen":
-                    prot_partner = row["Prot_partner"]
-                    peptide_partner = str(row["RESNR_LIG"]) + row["RESTYPE_LIG"]
+                elif interaction == "hbond":
+                    if row["PROTISDON"]:
+                        type_ = "Acceptor"
+                    else:
+                        type_ = "Donor"
+                    col_name = f"{prot_partner}_{peptide_partner}_{type_}_{interaction}"
+                elif interaction == "halogen":
                     halogen = row["DONORTYPE"]
-                    interaction = row["INTERACTION"]
-                    # Concatenate the values to form a unique column name
-                    col_name = (
-                        f"{prot_partner}_{peptide_partner}_{halogen}_{interaction}"
-                    )
-                elif row["INTERACTION"] == "waterbridge":
-                    if row["PROTISDON"] == True:
-                        prot_partner = row["Prot_partner"]
-                        peptide_partner = str(row["RESNR_LIG"]) + row["RESTYPE_LIG"]
-                        interaction = row["INTERACTION"]
-                        type = "Acceptor"
-                        # Concatenate the values to form a unique column name
-                        col_name = (
-                            f"{prot_partner}_{peptide_partner}_{type}_{interaction}"
-                        )
-                    elif row["PROTISDON"] == False:
-                        prot_partner = row["Prot_partner"]
-                        peptide_partner = str(row["RESNR_LIG"]) + row["RESTYPE_LIG"]
-                        interaction = row["INTERACTION"]
-                        type = "Donor"
-                        # Concatenate the values to form a unique column name
-                        col_name = (
-                            f"{prot_partner}_{peptide_partner}_{type}_{interaction}"
-                        )
-                elif row["INTERACTION"] == "pistacking":
-                    prot_partner = row["Prot_partner"]
-                    peptide_partner = str(row["RESNR_LIG"]) + row["RESTYPE_LIG"]
-                    interaction = row["INTERACTION"]
-                    # Concatenate the values to form a unique column name
+                    col_name = f"{prot_partner}_{peptide_partner}_{halogen}_{interaction}"
+                elif interaction == "waterbridge":
+                    if row["PROTISDON"]:
+                        type_ = "Acceptor"
+                    else:
+                        type_ = "Donor"
+                    col_name = f"{prot_partner}_{peptide_partner}_{type_}_{interaction}"
+                elif interaction == "pistacking":
                     col_name = f"{prot_partner}_{peptide_partner}_{interaction}"
-                elif row["INTERACTION"] == "pication":
-                    prot_partner = row["Prot_partner"]
-                    ligidx = str(row["RESNR_LIG"]) + row["RESTYPE_LIG"]
+                elif interaction == "pication":
+                    ligidx = f"{row['RESNR_LIG']}{row['RESTYPE_LIG']}"
                     ligtype = row["RESTYPE_LIG"]
-                    interaction = row["INTERACTION"]
-                    # Concatenate the values to form a unique column name
                     col_name = f"{prot_partner}_{ligidx}_{ligtype}_{interaction}"
                     col_name = col_name.replace(",", "_")
-                elif row["INTERACTION"] == "saltbridge":
-                    prot_partner = row["Prot_partner"]
-                    ligidx = str(row["RESNR_LIG"]) + row["RESTYPE_LIG"]
+                elif interaction == "saltbridge":
+                    ligidx = f"{row['RESNR_LIG']}{row['RESTYPE_LIG']}"
                     lig_group = row["RESTYPE_LIG"]
-                    interaction = row["INTERACTION"]
-                    if row["PROTISPOS"] == True:
-                        type = "NI"
-                        # Concatenate the values to form a unique column name
-                        col_name = (
-                            f"{prot_partner}_{ligidx}_{lig_group}_{type}_{interaction}"
-                        )
-                    elif row["PROTISPOS"] == False:
-                        type = "PI"
-                        col_name = (
-                            f"{prot_partner}_{ligidx}_{lig_group}_{type}_{interaction}"
-                        )
-                elif row["INTERACTION"] == "metal":
+                    if row["PROTISPOS"]:
+                        type_ = "NI"
+                    else:
+                        type_ = "PI"
+                    col_name = f"{prot_partner}_{ligidx}_{lig_group}_{type_}_{interaction}"
+                elif interaction == "metal":
                     special_ligand = row["RESTYPE_LIG"]
-                    ligcarbonidx = str(row["RESNR_LIG"]) + row["RESTYPE_LIG"]
+                    ligcarbonidx = f"{row['RESNR_LIG']}{row['RESTYPE_LIG']}"
                     metal_type = row["METAL_TYPE"]
                     coordination = row["COORDINATION"]
-                    interaction = row["INTERACTION"]
-                    # Concatenate the values to form a unique column name
                     col_name = f"{special_ligand}_{ligcarbonidx}_{metal_type}_{coordination}_{interaction}"
+
                 frame_value = row["FRAME"]
                 if frame_value not in unique_columns_rings_grouped:
                     unique_columns_rings_grouped[frame_value] = {}
-                if row["INTERACTION"] != "skip":
+                if interaction != "skip":
                     unique_columns_rings_grouped[frame_value][index] = col_name
-                    # Add the column name and its value to the dictionary
                     unique_columns_rings[index] = col_name
-        print("\033[1minteraction partners generated\033[0m")
 
+        print("\033[1minteraction partners generated\033[0m")
         return unique_columns_rings_grouped
 
-    def remove_duplicate_values(self, data):
+    def remove_duplicate_values(self, data: Dict[int, Dict[int, str]]) -> Dict[int, Dict[int, str]]:
         """Remove the duplicate values from sub-dictionaries within the input dictionary.
 
         Args:
@@ -280,7 +204,7 @@ def remove_duplicate_values(self, data):
 
         return unique_data
 
-    def combine_subdict_values(self, data):
+    def combine_subdict_values(self, data: Dict[int, Dict[int, str]]) -> Dict[str, List[str]]:
         """Combines the values from the individual sub-dictionaries into a single list.
 
         Args:
@@ -295,52 +219,39 @@ def combine_subdict_values(self, data):
 
         return combined_data
 
-    def filtering_values(self, threshold, df):
+    def filtering_values(
+        self, threshold: float, df: pd.DataFrame
+    ) -> List[str]:
         """Filter and append values (interactions) to a DataFrame based on occurrence counts.
 
         Args:
-            threshold (float): A treshold value that is used for filtering of the values (interactions) based upon the occurence count.
-            df (pandas dataframe): DataFrame to which the filtered values (interactions) will be added.
-            unique_columns_rings_grouped (dict): Dictionary containing the grouped and unique values otained from gather_interactions.
+            threshold (float): A threshold value used for filtering the values (interactions) based on occurrence count.
+            df (pandas.DataFrame): DataFrame to which the filtered values (interactions) will be added.
 
         Returns:
-            list: A list of values, with unique values and their corresponding occurence counts.
+            list: A list of values, with unique values and their corresponding occurrence counts.
         """
-
         frames = len(self.pdb_md.trajectory) - 1
-        # Call the function to remove duplicate keys
         unique_data = self.remove_duplicate_values(self.unique_columns_rings_grouped)
-
-        # Call the function to combine sub-dictionary values
         unique_colums_rings_all = self.combine_subdict_values(unique_data)
-
-        # Flatten the list of values
         all_values = unique_colums_rings_all["all"]
 
-        # Count the occurrences of each value
         occurrences = {}
         for value in all_values:
-            if value in occurrences:
-                occurrences[value] += 1
-            else:
-                occurrences[value] = 1
+            occurrences[value] = occurrences.get(value, 0) + 1
 
-        # Calculate the threshold (20% of 1000)
         threshold = threshold * frames
-
-        # Filter out values that appear in less than 20% of 1000
         filtered_values = [
             value for value, count in occurrences.items() if count >= threshold
         ]
 
-        # Append the filtered values as columns to the DataFrame
         for value in filtered_values:
             df[value] = None
 
         return filtered_values
 
-    def unique_data_generation(self, filtered_values):
-        """Generate a dictionary conataing the unique interactions from a list of filtered values obtained by filtering_values.
+    def unique_data_generation(self, filtered_values: List[str]) -> Dict[str, str]:
+        """Generate a dictionary containing the unique interactions from a list of filtered values obtained by filtering_values.
 
         Args:
             filtered_values (list): A list of values, where the unique interactions are extracted from.
@@ -348,25 +259,19 @@ def unique_data_generation(self, filtered_values):
         Returns:
             dict: A dictionary containing the filtered unique interactions.
         """
-        # Create a new dictionary to store unique values
         unique_data = {}
-
-        # Loop through the elements of the data_list
         for element in filtered_values:
-            # Check if the element is not already in the unique_data dictionary keys
             if element not in unique_data:
-                # If not, add the element as both key and value
                 unique_data[element] = element
 
         return unique_data
 
-    def df_iteration_numbering(self, df, unique_data):
+  def df_iteration_numbering(self, df: pd.DataFrame, unique_data: Dict[str, str]) -> None:
         """Loop through the DataFrame and assign the values 1 and 0 to the rows, depending if the corresponding interaction from unique data is present.
 
         Args:
-            df (pandas dataframe): DataFrame which has the interaction data for all of the frames.
-            unique_data (dict): Dictionary that contains the unique interactions obtained from unique_data_generation.
-            peptide (str, optional): name of the peptide chainid in the original topology. Defaults to None.
+            df (pd.DataFrame): DataFrame which has the interaction data for all of the frames.
+            unique_data (Dict[str, str]): Dictionary that contains the unique interactions obtained from unique_data_generation.
         """
         if self.peptide is None:
             for index, row in df.iterrows():
@@ -382,13 +287,11 @@ def df_iteration_numbering(self, df, unique_data):
                                     row["INTERACTION"] == interaction
                                 )
                                 df.at[index, col] = 1 if condition else 0
-                            else:
-                                continue
                 elif row["INTERACTION"] == "hbond":
                     if row["PROTISDON"] == True:
                         for col in unique_data.values():
                             if "hbond" in col:
-                                prot_partner, ligcarbonidx, type, interaction = (
+                                prot_partner, ligcarbonidx, _type, interaction = (
                                     col.split("_")
                                 )
                                 ligcarbonidx = int(ligcarbonidx)
@@ -401,7 +304,7 @@ def df_iteration_numbering(self, df, unique_data):
                     elif row["PROTISDON"] == False:
                         for col in unique_data.values():
                             if "hbond" in col:
-                                prot_partner, ligcarbonidx, type, interaction = (
+                                prot_partner, ligcarbonidx, _type, interaction = (
                                     col.split("_")
                                 )
                                 ligcarbonidx = int(ligcarbonidx)
@@ -438,8 +341,8 @@ def df_iteration_numbering(self, df, unique_data):
                 elif row["INTERACTION"] == "waterbridge":
                     for col in unique_data.values():
                         if "waterbridge" in col:
-                            if row["PROTISDON"] == True:
-                                prot_partner, ligcarbonidx, type, interaction = (
+                            if row["PROTISDON"]:
+                                prot_partner, ligcarbonidx, _type, interaction = (
                                     col.split("_")
                                 )
                                 condition = (
@@ -448,8 +351,8 @@ def df_iteration_numbering(self, df, unique_data):
                                     & (row["INTERACTION"] == interaction)
                                 )
                                 df.at[index, col] = 1 if condition else 0
-                            elif row["PROTISDON"] == False:
-                                prot_partner, ligcarbonidx, type, interaction = (
+                            else:
+                                prot_partner, ligcarbonidx, _type, interaction = (
                                     col.split("_")
                                 )
                                 condition = (
@@ -482,7 +385,7 @@ def df_iteration_numbering(self, df, unique_data):
                             ligidx = parts[1:-3]
                             ligidx = ",".join(ligidx)
                             lig_group = parts[-3]
-                            type = parts[-2]
+                            _type = parts[-2]
                             interaction = parts[-1]
                             condition = (
                                 (row["Prot_partner"] == prot_partner)
@@ -506,7 +409,8 @@ def df_iteration_numbering(self, df, unique_data):
                                 & (row["INTERACTION"] == interaction)
                             )
                             df.at[index, col] = 1 if condition else 0
-        if self.peptide is not None:
+
+        else:  # If self.peptide is not None
             for index, row in df.iterrows():
                 if row["INTERACTION"] == "hydrophobic":
                     for col in unique_data.values():
@@ -520,13 +424,11 @@ def df_iteration_numbering(self, df, unique_data):
                                     row["INTERACTION"] == interaction
                                 )
                                 df.at[index, col] = 1 if condition else 0
-                            else:
-                                continue
                 elif row["INTERACTION"] == "hbond":
-                    if row["PROTISDON"] == True:
+                    if row["PROTISDON"]:
                         for col in unique_data.values():
                             if "hbond" in col:
-                                prot_partner, ligcarbonidx, type, interaction = (
+                                prot_partner, ligcarbonidx, _type, interaction = (
                                     col.split("_")
                                 )
                                 condition = (
@@ -538,10 +440,10 @@ def df_iteration_numbering(self, df, unique_data):
                                     & (row["INTERACTION"] == interaction)
                                 )
                                 df.at[index, col] = 1 if condition else 0
-                    elif row["PROTISDON"] == False:
+                    else:
                         for col in unique_data.values():
                             if "hbond" in col:
-                                prot_partner, ligcarbonidx, type, interaction = (
+                                prot_partner, ligcarbonidx, _type, interaction = (
                                     col.split("_")
                                 )
                                 condition = (
@@ -585,46 +487,31 @@ def df_iteration_numbering(self, df, unique_data):
                 elif row["INTERACTION"] == "waterbridge":
                     for col in unique_data.values():
                         if "waterbridge" in col:
-                            if row["PROTISDON"] == True:
-                                prot_partner, ligcarbonidx, type, interaction = (
-                                    col.split("_")
-                                )
-                                condition = (
-                                    (row["Prot_partner"] == prot_partner)
-                                    & (
-                                        (str(row["RESNR_LIG"]) + row["RESTYPE_LIG"])
-                                        == ligcarbonidx
-                                    )
-                                    & (row["INTERACTION"] == interaction)
-                                )
-                                df.at[index, col] = 1 if condition else 0
-                            elif row["PROTISDON"] == False:
-                                prot_partner, ligcarbonidx, type, interaction = (
-                                    col.split("_")
-                                )
-                                condition = (
-                                    (row["Prot_partner"] == prot_partner)
-                                    & (
-                                        (str(row["RESNR_LIG"]) + row["RESTYPE_LIG"])
-                                        == ligcarbonidx
-                                    )
-                                    & (row["INTERACTION"] == interaction)
+                            prot_partner, ligcarbonidx, _type, interaction = (
+                                col.split("_")
+                            )
+                            condition = (
+                                (row["Prot_partner"] == prot_partner)
+                                & (
+                                    (str(row["RESNR_LIG"]) + row["RESTYPE_LIG"])
+                                    == ligcarbonidx
                                 )
-                                df.at[index, col] = 1 if condition else 0
+                                & (row["INTERACTION"] == interaction)
+                            )
+                            df.at[index, col] = 1 if condition else 0
                 elif row["INTERACTION"] == "pication":
                     for col in unique_data.values():
                         if "pication" in col:
                             parts = col.split("_")
                             prot_partner = parts[0]
-                            ligidx = parts[1]
+                            ligidx = parts[1:-2]
+                            ligidx = ",".join(ligidx)
                             ligtype = parts[-2]
                             interaction = parts[-1]
                             condition = (
                                 (row["Prot_partner"] == prot_partner)
-                                & (
-                                    (str(row["RESNR_LIG"]) + row["RESTYPE_LIG"])
-                                    == ligidx
-                                )
+                                & (row["RESNR_LIG"] == ligidx)
+                                & (row["RESTYPE_LIG"] == ligtype)
                                 & (row["INTERACTION"] == interaction)
                             )
                             df.at[index, col] = 1 if condition else 0
@@ -633,16 +520,15 @@ def df_iteration_numbering(self, df, unique_data):
                         if "saltbridge" in col:
                             parts = col.split("_")
                             prot_partner = parts[0]
-                            ligidx = parts[1]
+                            ligidx = parts[1:-3]
+                            ligidx = ",".join(ligidx)
                             lig_group = parts[-3]
-                            type = parts[-2]
+                            _type = parts[-2]
                             interaction = parts[-1]
                             condition = (
                                 (row["Prot_partner"] == prot_partner)
-                                & (
-                                    (str(row["RESNR_LIG"]) + row["RESTYPE_LIG"])
-                                    == ligidx
-                                )
+                                & (row["RESNR_LIG"] == ligidx)
+                                & (row["RESTYPE_LIG"] == lig_group)
                                 & (row["INTERACTION"] == interaction)
                             )
                             df.at[index, col] = 1 if condition else 0
@@ -651,29 +537,29 @@ def df_iteration_numbering(self, df, unique_data):
                         if "metal" in col:
                             parts = col.split("_")
                             special_ligand = parts[0]
-                            ligidx = parts[1]
+                            ligidx = int(parts[1])
                             metal_type = parts[2]
                             interaction = parts[-1]
                             condition = (
                                 (row["RESTYPE_LIG"] == special_ligand)
-                                & (
-                                    (str(row["RESNR_LIG"]) + row["RESTYPE_LIG"])
-                                    == ligidx
-                                )
+                                & (int(row["TARGET_IDX"]) == ligidx)
                                 & (row["METAL_TYPE"] == metal_type)
                                 & (row["INTERACTION"] == interaction)
                             )
                             df.at[index, col] = 1 if condition else 0
 
-    def update_values(self, df, new, unique_data):
-        """Update the values in the input DataFrame based upon the frame values and an reference DataFrame.
+   from typing import Dict
+import pandas as pd
 
-        Args:
-            df (pandas dataframe): Input DataFrame that will be updated.
-            new (pandas dataframe): The reference DataFrame containing values that are used to update the input DataFrame.
-            unique_data (dict): A dictionary containing keys that represent the specific unique column names that need to be updated in the input DataFrame.
-        """
-        for idx, row in df.iterrows():
-            frame_value = row["FRAME"]
-            values_to_update = new.loc[frame_value, list(unique_data.values())]
-            df.loc[idx, list(unique_data.values())] = values_to_update
+def update_values(self, df: pd.DataFrame, new: pd.DataFrame, unique_data: Dict[str, str]) -> None:
+    """Update the values in the input DataFrame based upon the frame values and a reference DataFrame.
+
+    Args:
+        df (pandas.DataFrame): Input DataFrame that will be updated.
+        new (pandas.DataFrame): The reference DataFrame containing values that are used to update the input DataFrame.
+        unique_data (Dict[str, str]): A dictionary containing keys that represent the specific unique column names that need to be updated in the input DataFrame.
+    """
+    for idx, row in df.iterrows():
+        frame_value = row["FRAME"]
+        values_to_update = new.loc[frame_value, list(unique_data.values())]
+        df.loc[idx, list(unique_data.values())] = values_to_update

From 6ff92f5ebddef8abe1720b27209fcd0e2cd79bd0 Mon Sep 17 00:00:00 2001
From: ARMI <armenianapoli@gmail.com>
Date: Mon, 19 Aug 2024 15:55:09 +0200
Subject: [PATCH 11/23] Update interaction_gathering.py

added type checking and hinting
---
 .../interaction_gathering.py                  | 268 +++++++++++-------
 1 file changed, 160 insertions(+), 108 deletions(-)

diff --git a/openmmdl/openmmdl_analysis/interaction_gathering.py b/openmmdl/openmmdl_analysis/interaction_gathering.py
index 30e697be..c36ba09b 100644
--- a/openmmdl/openmmdl_analysis/interaction_gathering.py
+++ b/openmmdl/openmmdl_analysis/interaction_gathering.py
@@ -6,27 +6,29 @@
 from plip.structure.preparation import PDBComplex, PLInteraction
 from plip.exchange.report import BindingSiteReport
 from multiprocessing import Pool
-from typing import Optional, List, Dict, Tuple
+from functools import partial
+from typing import List, Dict, Tuple, Optional, Union
 
 config.KEEPMOD = True
 
+
 class InteractionAnalyzer:
     def __init__(
-        self,
-        pdb_md: mda.Universe,
-        dataframe: Optional[str],
-        num_processes: int,
-        lig_name: str,
-        special_ligand: Optional[str],
+        self, 
+        pdb_md: mda.Universe, 
+        dataframe: Optional[str], 
+        num_processes: int, 
+        lig_name: str, 
+        special_ligand: Optional[str], 
         peptide: Optional[str]
-    ):
+    ) -> None:
         self.pdb_md = pdb_md
         self.dataframe = dataframe
         self.num_processes = num_processes
         self.lig_name = lig_name
         self.special = special_ligand
         self.peptide = peptide
-        self.ineraction_list = self.process_trajectory()
+        self.interaction_list = self.process_trajectory()
 
     def characterize_complex(
         self, pdb_file: str, binding_site_id: str
@@ -38,7 +40,7 @@ def characterize_complex(
             binding_site_id (str): Identifier of the binding site.
 
         Returns:
-            Optional[PLInteraction]: Interaction object or None if the binding site is not found.
+            Optional[PLInteraction]: Interactions if found; otherwise, None.
         """
         pdb_complex = PDBComplex()
         pdb_complex.load_pdb(pdb_file)
@@ -48,28 +50,28 @@ def characterize_complex(
                 == binding_site_id
             ):
                 pdb_complex.characterize_complex(ligand)
-                return pdb_complex.interaction_sets.get(binding_site_id)
+                return pdb_complex.interaction_sets[binding_site_id]
+
         return None
 
     def retrieve_plip_interactions(
         self, pdb_file: str, lig_name: str
     ) -> Dict[str, Dict[str, List]]:
-        """Retrieves the interactions from PLIP.
+        """Retrieve interactions from PLIP.
 
         Args:
-            pdb_file (str): Path of the PDB file of the complex.
-            lig_name (str): Name of the ligand in the complex.
+            pdb_file (str): Path to the PDB file of the complex.
+            lig_name (str): Name of the ligand in the complex topology.
 
         Returns:
-            Dict[str, Dict[str, List]]: Dictionary of binding sites and their interactions.
+            Dict[str, Dict[str, List]]: Dictionary of binding sites and interactions.
         """
         protlig = PDBComplex()
         protlig.load_pdb(pdb_file)
         for ligand in protlig.ligands:
             if str(ligand.longname) == lig_name:
                 protlig.characterize_complex(ligand)
-        
-        sites = {}
+        sites: Dict[str, Dict[str, List]] = {}
         for key, site in sorted(protlig.interaction_sets.items()):
             binding_site = BindingSiteReport(site)
             keys = (
@@ -94,19 +96,18 @@ def retrieve_plip_interactions(
     def retrieve_plip_interactions_peptide(
         self, pdb_file: str
     ) -> Dict[str, Dict[str, List]]:
-        """Retrieves the interactions from PLIP for a peptide.
+        """Retrieve interactions from PLIP for a peptide.
 
         Args:
-            pdb_file (str): Path of the PDB file of the complex.
+            pdb_file (str): Path to the PDB file of the complex.
 
         Returns:
-            Dict[str, Dict[str, List]]: Dictionary of binding sites and their interactions.
+            Dict[str, Dict[str, List]]: Dictionary of binding sites and interactions.
         """
         protlig = PDBComplex()
         protlig.load_pdb(pdb_file)
         protlig.characterize_complex(protlig.ligands[-1])
-        
-        sites = {}
+        sites: Dict[str, Dict[str, List]] = {}
         for key, site in sorted(protlig.interaction_sets.items()):
             binding_site = BindingSiteReport(site)
             keys = (
@@ -131,16 +132,16 @@ def retrieve_plip_interactions_peptide(
     def create_df_from_binding_site(
         self, selected_site_interactions: Dict[str, List], interaction_type: str = "hbond"
     ) -> pd.DataFrame:
-        """Creates a DataFrame from a binding site and interaction type.
+        """Create a DataFrame from a binding site and interaction type.
 
         Args:
-            selected_site_interactions (Dict[str, List]): Pre-calculated interactions from PLIP.
-            interaction_type (str, optional): The interaction type of interest. Defaults to "hbond".
+            selected_site_interactions (Dict[str, List]): Precalculated interactions from PLIP for the selected site.
+            interaction_type (str, optional): Interaction type of interest (default is "hbond").
 
         Returns:
-            pd.DataFrame: DataFrame with information retrieved from PLIP.
+            pd.DataFrame: DataFrame with information from PLIP.
         """
-        valid_types = (
+        valid_types = [
             "hydrophobic",
             "hbond",
             "waterbridge",
@@ -149,7 +150,7 @@ def create_df_from_binding_site(
             "pication",
             "halogen",
             "metal",
-        )
+        ]
 
         if interaction_type not in valid_types:
             print(
@@ -166,11 +167,11 @@ def create_df_from_binding_site(
     def change_lig_to_residue(
         self, file_path: str, new_residue_name: str
     ) -> None:
-        """Reformats the topology file to change the ligand to a residue.
+        """Reformat the topology file to change the ligand to a residue.
 
         Args:
-            file_path (str): Filepath of the topology file.
-            new_residue_name (str): New residue name of the ligand.
+            file_path (str): Path to the topology file.
+            new_residue_name (str): New residue name.
         """
         with open(file_path, "r") as file:
             lines = file.readlines()
@@ -189,21 +190,64 @@ def change_lig_to_residue(
                 else:
                     file.write(line)
 
+    def process_frame_special(self, frame: int) -> pd.DataFrame:
+        """Process a single frame of MD simulation for special ligands.
+
+        Args:
+            frame (int): Frame number to be processed.
+
+        Returns:
+            pd.DataFrame: DataFrame containing interaction data for the processed frame.
+        """
+        atoms_selected = self.pdb_md.select_atoms(
+            f"protein or nucleic or resname {self.lig_name} or (resname HOH and around 10 resname {self.lig_name}) or resname {self.special}"
+        )
+        for num in self.pdb_md.trajectory[(frame):(frame + 1)]:
+            atoms_selected.write(f"processing_frame_{frame}.pdb")
+        interaction_list = pd.DataFrame()
+        interactions_by_site = self.retrieve_plip_interactions(
+            f"processing_frame_{frame}.pdb", self.lig_name
+        )
+        index_of_selected_site = -1
+        selected_site = list(interactions_by_site.keys())[index_of_selected_site]
+
+        interaction_types = [
+            "hydrophobic",
+            "hbond",
+            "waterbridge",
+            "saltbridge",
+            "pistacking",
+            "pication",
+            "halogen",
+            "metal",
+        ]
+
+        for interaction_type in interaction_types:
+            tmp_interaction = self.create_df_from_binding_site(
+                interactions_by_site[selected_site],
+                interaction_type=interaction_type,
+            )
+            tmp_interaction["frame"] = frame
+            tmp_interaction["interaction_type"] = interaction_type
+            interaction_list = pd.concat([interaction_list, tmp_interaction])
+        
+        os.remove(f"processing_frame_{frame}.pdb")
+        return interaction_list
+
     def process_frame(self, frame: int) -> pd.DataFrame:
         """Process a single frame of MD simulation.
 
         Args:
-            frame (int): The number of the frame that will be processed.
+            frame (int): Frame number to be processed.
 
         Returns:
-            pd.DataFrame: DataFrame containing the interaction data for the processed frame.
+            pd.DataFrame: DataFrame containing interaction data for the processed frame.
         """
         atoms_selected = self.pdb_md.select_atoms(
             f"protein or nucleic or resname {self.lig_name} or (resname HOH and around 10 resname {self.lig_name}) or resname {self.special}"
         )
         for num in self.pdb_md.trajectory[(frame):(frame + 1)]:
             atoms_selected.write(f"processing_frame_{frame}.pdb")
-
         interaction_list = pd.DataFrame()
         if self.peptide is None:
             interactions_by_site = self.retrieve_plip_interactions(
@@ -228,90 +272,98 @@ def process_frame(self, frame: int) -> pd.DataFrame:
                     interactions_by_site[selected_site],
                     interaction_type=interaction_type,
                 )
-                tmp_interaction["FRAME"] = int(frame)
-                tmp_interaction["INTERACTION"] = interaction_type
+                tmp_interaction["frame"] = frame
+                tmp_interaction["interaction_type"] = interaction_type
                 interaction_list = pd.concat([interaction_list, tmp_interaction])
-
-            if os.path.exists(f"processing_frame_{frame}.pdb"):
-                os.remove(f"processing_frame_{frame}.pdb")
-
-            if self.special is not None:
-                combi_lig_special = mda.Universe("ligand_special.pdb")
-                complex = mda.Universe("complex.pdb")
-                complex_all = complex.select_atoms("all")
-                result = self.process_frame_special(frame)
-                results_df = pd.concat(result, ignore_index=True)
-                results_df = results_df[results_df["LOCATION"] == "protein.sidechain"]
-                results_df["RESTYPE"] = results_df["RESTYPE"].replace(
-                    ["HIS", "SER", "CYS"], self.lig_name
-                )
-                results_df["LOCATION"] = results_df["LOCATION"].replace(
-                    "protein.sidechain", "ligand"
+        else:
+            interactions_by_site = self.retrieve_plip_interactions_peptide(
+                f"processing_frame_{frame}.pdb"
+            )
+            selected_site = list(interactions_by_site.keys())[0]
+            interaction_types = [
+                "hydrophobic",
+                "hbond",
+                "waterbridge",
+                "saltbridge",
+                "pistacking",
+                "pication",
+                "halogen",
+                "metal",
+            ]
+            for interaction_type in interaction_types:
+                tmp_interaction = self.create_df_from_binding_site(
+                    interactions_by_site[selected_site],
+                    interaction_type=interaction_type,
                 )
-                updated_target_idx = []
-
-                for index, row in results_df.iterrows():
-                    ligand_special_int_nr = int(row["TARGET_IDX"])
-                    ligand_special_int_nr_atom = combi_lig_special.select_atoms(
-                        f"id {ligand_special_int_nr}"
-                    )
-                    for atom in ligand_special_int_nr_atom:
-                        atom_name = atom.name
-                        if atom_name in ["N", "C", "O", "S"]:
-                            atom_name = f"{atom_name}1"
-                        else:
-                            base_name, atom_number = atom_name[:-1], int(atom_name[-1])
-                            new_atom_number = atom_number + 1
-                            atom_name = f"{base_name}{new_atom_number}"
-                        for complex_atom in complex_all:
-                            complex_atom_name = complex_atom.name
-                            if complex_atom_name == atom_name:
-                                updated_target_idx.append(row["TARGET_IDX"])
-                                break
-
-                results_df["TARGET_IDX"] = updated_target_idx
-                results_df = results_df[
-                    (results_df["INTERACTION"] == "hbond") & (results_df["TARGET_IDX"].notna())
-                ]
-                interaction_list = pd.concat([interaction_list, results_df])
-
+                tmp_interaction["frame"] = frame
+                tmp_interaction["interaction_type"] = interaction_type
+                interaction_list = pd.concat([interaction_list, tmp_interaction])
+        
+        os.remove(f"processing_frame_{frame}.pdb")
         return interaction_list
 
-    def process_trajectory(self) -> pd.DataFrame:
-        """Process the entire trajectory of the MD simulation.
+    def process_frame_wrapper(self, frame: int) -> pd.DataFrame:
+        """Wrapper function to handle processing of a frame, including special cases.
+
+        Args:
+            frame (int): Frame number to be processed.
 
         Returns:
-            pd.DataFrame: DataFrame containing interaction data across all frames.
+            pd.DataFrame: DataFrame containing interaction data for the processed frame.
         """
-        total_frames = len(self.pdb_md.trajectory)
-        frame_range = range(total_frames)
-        with Pool(processes=self.num_processes) as pool:
-            results = list(tqdm(pool.imap(self.process_frame, frame_range), total_frames))
-        
-        all_interactions = pd.concat(results, ignore_index=True)
-        return all_interactions
+        if self.special:
+            return self.process_frame_special(frame)
+        return self.process_frame(frame)
 
-    def process_frame_special(self, frame: int) -> List[pd.DataFrame]:
-        """Placeholder method for processing special frame interactions.
+    def fill_missing_frames(self, df: pd.DataFrame) -> pd.DataFrame:
+        """Fill in missing frames in the DataFrame with NaNs.
 
         Args:
-            frame (int): The number of the frame to process.
+            df (pd.DataFrame): Original DataFrame with potentially missing frames.
 
         Returns:
-            List[pd.DataFrame]: List of DataFrames with interaction data.
+            pd.DataFrame: DataFrame with missing frames filled.
         """
-        # Implement specific frame processing for special cases if needed.
-        return []
-
-# Example of usage:
-# pdb_md = mda.Universe('your_pdb_file.pdb')
-# analyzer = InteractionAnalyzer(
-#     pdb_md=pdb_md,
-#     dataframe='your_dataframe.csv',
-#     num_processes=4,
-#     lig_name='LIG',
-#     special_ligand='SPL',
-#     peptide='PEP'
-# )
-# df = analyzer.ineraction_list
-# print(df)
+        all_frames = set(range(len(self.pdb_md.trajectory)))
+        existing_frames = set(df["frame"].unique())
+        missing_frames = all_frames - existing_frames
+        missing_data = []
+
+        for frame in missing_frames:
+            for interaction_type in df["interaction_type"].unique():
+                missing_data.append({
+                    "frame": frame,
+                    "interaction_type": interaction_type,
+                    # Add other necessary columns with NaNs or empty values as appropriate
+                })
+
+        missing_df = pd.DataFrame(missing_data)
+        filled_df = pd.concat([df, missing_df], ignore_index=True)
+        return filled_df
+
+    def process_trajectory(self) -> pd.DataFrame:
+        """Main method for processing the trajectory.
+
+        Returns:
+            pd.DataFrame: A DataFrame containing all interaction data for the trajectory.
+        """
+        pool = Pool(processes=self.num_processes)
+        with tqdm(total=len(self.pdb_md.trajectory)) as pbar:
+            interaction_list = pd.concat(
+                list(
+                    tqdm(
+                        pool.imap(self.process_frame_wrapper, range(len(self.pdb_md.trajectory))),
+                        total=len(self.pdb_md.trajectory),
+                    )
+                )
+            )
+        pool.close()
+        pool.join()
+
+        if self.dataframe:
+            interaction_list.to_csv(self.dataframe)
+        
+        # Ensure all frames are represented in the final DataFrame
+        interaction_list = self.fill_missing_frames(interaction_list)
+        
+        return interaction_list

From 59c621296026e3a5a65262a604bc116c85e37abf Mon Sep 17 00:00:00 2001
From: ARMI <armenianapoli@gmail.com>
Date: Thu, 22 Aug 2024 10:59:17 +0200
Subject: [PATCH 12/23] Add files via upload

---
 .../openmmdl_analysis/barcode_generation.py   |   2 +-
 .../binding_mode_processing.py                |  67 ++++----
 .../openmmdl_analysis/find_stable_waters.py   |  25 +--
 .../interaction_gathering.py                  |  51 +++---
 .../markov_state_figure_generation.py         | 145 +++++++++++++-----
 .../openmmdl_analysis/openmmdlanalysis.py     |   2 +-
 openmmdl/openmmdl_analysis/pharmacophore.py   |  33 ++--
 openmmdl/openmmdl_analysis/preprocessing.py   |  44 +++++-
 .../rdkit_figure_generation.py                |  76 +++++----
 .../openmmdl_analysis/rmsd_calculation.py     |   4 +-
 .../openmmdl_analysis/visualization.ipynb     |  10 +-
 .../visualization_functions.py                |  29 +++-
 12 files changed, 335 insertions(+), 153 deletions(-)

diff --git a/openmmdl/openmmdl_analysis/barcode_generation.py b/openmmdl/openmmdl_analysis/barcode_generation.py
index 2ec99d02..ee804f9b 100644
--- a/openmmdl/openmmdl_analysis/barcode_generation.py
+++ b/openmmdl/openmmdl_analysis/barcode_generation.py
@@ -187,7 +187,7 @@ def plot_waterbridge_piechart(
         self,
         waterbridge_barcodes: Dict[str, np.ndarray],
         waterbridge_interactions: List[str],
-        fig_type: str
+        fig_type: str,
     ) -> None:
         """
         Plots pie charts for waterbridge interactions and saves them to files.
diff --git a/openmmdl/openmmdl_analysis/binding_mode_processing.py b/openmmdl/openmmdl_analysis/binding_mode_processing.py
index 0d326b19..135694d0 100644
--- a/openmmdl/openmmdl_analysis/binding_mode_processing.py
+++ b/openmmdl/openmmdl_analysis/binding_mode_processing.py
@@ -10,16 +10,16 @@
 from typing import Dict, List, Union, Optional, Tuple
 
 
-class BindingModeProcessor:
+class BindingModeProcesser:
     def __init__(
         self,
-        pdb_md: 'MDAnalysis.Universe',  # Placeholder for the actual type
+        pdb_md,
         ligand: str,
         peptide: Optional[str],
         special: str,
         ligand_rings: List[List[int]],
         interaction_list: pd.DataFrame,
-        threshold: float
+        threshold: float,
     ) -> None:
         self.pdb_md = pdb_md
         self.ligand = ligand
@@ -65,11 +65,11 @@ def gather_interactions(self, df: pd.DataFrame) -> Dict[int, Dict[int, str]]:
                     ring_found = False
                     for ligand_ring in self.ligand_rings:
                         if ligcarbonidx in ligand_ring:
-                            numbers_as_strings = [
-                                str(idx) for idx in ligand_ring
-                            ]
+                            numbers_as_strings = [str(idx) for idx in ligand_ring]
                             name_with_numbers = "_".join(numbers_as_strings)
-                            col_name = f"{prot_partner}_{name_with_numbers}_{interaction}"
+                            col_name = (
+                                f"{prot_partner}_{name_with_numbers}_{interaction}"
+                            )
                             ring_found = True
                             break
                     if not ring_found:
@@ -109,7 +109,9 @@ def gather_interactions(self, df: pd.DataFrame) -> Dict[int, Dict[int, str]]:
                         type_ = "NI"
                     else:
                         type_ = "PI"
-                    col_name = f"{prot_partner}_{ligidx}_{lig_group}_{type_}_{interaction}"
+                    col_name = (
+                        f"{prot_partner}_{ligidx}_{lig_group}_{type_}_{interaction}"
+                    )
                 elif interaction == "metal":
                     special_ligand = row["RESTYPE_LIG"]
                     ligcarbonidx = int(row["TARGET_IDX"])
@@ -140,7 +142,9 @@ def gather_interactions(self, df: pd.DataFrame) -> Dict[int, Dict[int, str]]:
                     col_name = f"{prot_partner}_{peptide_partner}_{type_}_{interaction}"
                 elif interaction == "halogen":
                     halogen = row["DONORTYPE"]
-                    col_name = f"{prot_partner}_{peptide_partner}_{halogen}_{interaction}"
+                    col_name = (
+                        f"{prot_partner}_{peptide_partner}_{halogen}_{interaction}"
+                    )
                 elif interaction == "waterbridge":
                     if row["PROTISDON"]:
                         type_ = "Acceptor"
@@ -161,7 +165,9 @@ def gather_interactions(self, df: pd.DataFrame) -> Dict[int, Dict[int, str]]:
                         type_ = "NI"
                     else:
                         type_ = "PI"
-                    col_name = f"{prot_partner}_{ligidx}_{lig_group}_{type_}_{interaction}"
+                    col_name = (
+                        f"{prot_partner}_{ligidx}_{lig_group}_{type_}_{interaction}"
+                    )
                 elif interaction == "metal":
                     special_ligand = row["RESTYPE_LIG"]
                     ligcarbonidx = f"{row['RESNR_LIG']}{row['RESTYPE_LIG']}"
@@ -179,7 +185,9 @@ def gather_interactions(self, df: pd.DataFrame) -> Dict[int, Dict[int, str]]:
         print("\033[1minteraction partners generated\033[0m")
         return unique_columns_rings_grouped
 
-    def remove_duplicate_values(self, data: Dict[int, Dict[int, str]]) -> Dict[int, Dict[int, str]]:
+    def remove_duplicate_values(
+        self, data: Dict[int, Dict[int, str]]
+    ) -> Dict[int, Dict[int, str]]:
         """Remove the duplicate values from sub-dictionaries within the input dictionary.
 
         Args:
@@ -204,7 +212,9 @@ def remove_duplicate_values(self, data: Dict[int, Dict[int, str]]) -> Dict[int,
 
         return unique_data
 
-    def combine_subdict_values(self, data: Dict[int, Dict[int, str]]) -> Dict[str, List[str]]:
+    def combine_subdict_values(
+        self, data: Dict[int, Dict[int, str]]
+    ) -> Dict[str, List[str]]:
         """Combines the values from the individual sub-dictionaries into a single list.
 
         Args:
@@ -219,9 +229,7 @@ def combine_subdict_values(self, data: Dict[int, Dict[int, str]]) -> Dict[str, L
 
         return combined_data
 
-    def filtering_values(
-        self, threshold: float, df: pd.DataFrame
-    ) -> List[str]:
+    def filtering_values(self, threshold: float, df: pd.DataFrame) -> List[str]:
         """Filter and append values (interactions) to a DataFrame based on occurrence counts.
 
         Args:
@@ -266,7 +274,9 @@ def unique_data_generation(self, filtered_values: List[str]) -> Dict[str, str]:
 
         return unique_data
 
-  def df_iteration_numbering(self, df: pd.DataFrame, unique_data: Dict[str, str]) -> None:
+    def df_iteration_numbering(
+        self, df: pd.DataFrame, unique_data: Dict[str, str]
+    ) -> None:
         """Loop through the DataFrame and assign the values 1 and 0 to the rows, depending if the corresponding interaction from unique data is present.
 
         Args:
@@ -487,8 +497,8 @@ def df_iteration_numbering(self, df: pd.DataFrame, unique_data: Dict[str, str])
                 elif row["INTERACTION"] == "waterbridge":
                     for col in unique_data.values():
                         if "waterbridge" in col:
-                            prot_partner, ligcarbonidx, _type, interaction = (
-                                col.split("_")
+                            prot_partner, ligcarbonidx, _type, interaction = col.split(
+                                "_"
                             )
                             condition = (
                                 (row["Prot_partner"] == prot_partner)
@@ -548,18 +558,17 @@ def df_iteration_numbering(self, df: pd.DataFrame, unique_data: Dict[str, str])
                             )
                             df.at[index, col] = 1 if condition else 0
 
-   from typing import Dict
-import pandas as pd
-
-def update_values(self, df: pd.DataFrame, new: pd.DataFrame, unique_data: Dict[str, str]) -> None:
-    """Update the values in the input DataFrame based upon the frame values and a reference DataFrame.
+    def update_values(
+        self, df: pd.DataFrame, new: pd.DataFrame, unique_data: Dict[str, str]
+    ) -> None:
+        """Update the values in the input DataFrame based upon the frame values and a reference DataFrame.
 
-    Args:
+        Args:
         df (pandas.DataFrame): Input DataFrame that will be updated.
         new (pandas.DataFrame): The reference DataFrame containing values that are used to update the input DataFrame.
         unique_data (Dict[str, str]): A dictionary containing keys that represent the specific unique column names that need to be updated in the input DataFrame.
-    """
-    for idx, row in df.iterrows():
-        frame_value = row["FRAME"]
-        values_to_update = new.loc[frame_value, list(unique_data.values())]
-        df.loc[idx, list(unique_data.values())] = values_to_update
+        """
+        for idx, row in df.iterrows():
+            frame_value = row["FRAME"]
+            values_to_update = new.loc[frame_value, list(unique_data.values())]
+            df.loc[idx, list(unique_data.values())] = values_to_update
diff --git a/openmmdl/openmmdl_analysis/find_stable_waters.py b/openmmdl/openmmdl_analysis/find_stable_waters.py
index 88e6ccb2..390d4fc1 100644
--- a/openmmdl/openmmdl_analysis/find_stable_waters.py
+++ b/openmmdl/openmmdl_analysis/find_stable_waters.py
@@ -8,6 +8,7 @@
 from Bio.PDB import PDBParser, Structure
 from typing import Tuple, Dict, List, Optional
 
+
 class StableWaters:
     def __init__(self, trajectory: str, topology: str, water_eps: float) -> None:
         self.trajectory = trajectory
@@ -49,9 +50,7 @@ def trace_waters(self, output_directory: str) -> Tuple[pd.DataFrame, int]:
                 stable_coords = []
 
                 for atom_index, coords in frame_coords.items():
-                    prev_coords = prev_frame_coords.get(
-                        atom_index, coords
-                    )
+                    prev_coords = prev_frame_coords.get(atom_index, coords)
 
                     distance = np.linalg.norm(np.array(coords) - np.array(prev_coords))
 
@@ -85,7 +84,11 @@ def trace_waters(self, output_directory: str) -> Tuple[pd.DataFrame, int]:
         return stable_waters, total_frames
 
     def perform_clustering_and_writing(
-        self, stable_waters: pd.DataFrame, cluster_eps: float, total_frames: int, output_directory: str
+        self,
+        stable_waters: pd.DataFrame,
+        cluster_eps: float,
+        total_frames: int,
+        output_directory: str,
     ) -> None:
         """
         Perform DBSCAN clustering on the stable water coordinates, and write the clusters and their representatives to PDB files.
@@ -121,7 +124,10 @@ def perform_clustering_and_writing(
             )
 
     def write_pdb_clusters_and_representatives(
-        self, clustered_waters: pd.DataFrame, min_samples: int, output_sub_directory: str
+        self,
+        clustered_waters: pd.DataFrame,
+        min_samples: int,
+        output_sub_directory: str,
     ) -> None:
         """
         Writes the clusters and their representatives to PDB files.
@@ -136,9 +142,7 @@ def write_pdb_clusters_and_representatives(
         print("minsamples:")
         print(min_samples)
         os.makedirs(output_sub_directory, exist_ok=True)
-        with pd.option_context(
-            "display.max_rows", None
-        ):
+        with pd.option_context("display.max_rows", None):
             for label, cluster in clustered_waters.groupby("Cluster_Label"):
                 pdb_lines = []
                 for _, row in cluster.iterrows():
@@ -210,7 +214,10 @@ def filter_and_parse_pdb(self, protein_pdb: str) -> Structure:
         return structure
 
     def find_interacting_residues(
-        self, structure: Structure, representative_waters: pd.DataFrame, distance_threshold: float
+        self,
+        structure: Structure,
+        representative_waters: pd.DataFrame,
+        distance_threshold: float,
     ) -> Dict[int, List[Tuple[str, int]]]:
         """Maps waters (e.g. the representative waters) to interacting residues of a different PDB structure input.
         Args:
diff --git a/openmmdl/openmmdl_analysis/interaction_gathering.py b/openmmdl/openmmdl_analysis/interaction_gathering.py
index c36ba09b..08d0d8f3 100644
--- a/openmmdl/openmmdl_analysis/interaction_gathering.py
+++ b/openmmdl/openmmdl_analysis/interaction_gathering.py
@@ -14,13 +14,13 @@
 
 class InteractionAnalyzer:
     def __init__(
-        self, 
-        pdb_md: mda.Universe, 
-        dataframe: Optional[str], 
-        num_processes: int, 
-        lig_name: str, 
-        special_ligand: Optional[str], 
-        peptide: Optional[str]
+        self,
+        pdb_md: mda.Universe,
+        dataframe: Optional[str],
+        num_processes: int,
+        lig_name: str,
+        special_ligand: Optional[str],
+        peptide: Optional[str],
     ) -> None:
         self.pdb_md = pdb_md
         self.dataframe = dataframe
@@ -130,7 +130,9 @@ def retrieve_plip_interactions_peptide(
         return sites
 
     def create_df_from_binding_site(
-        self, selected_site_interactions: Dict[str, List], interaction_type: str = "hbond"
+        self,
+        selected_site_interactions: Dict[str, List],
+        interaction_type: str = "hbond",
     ) -> pd.DataFrame:
         """Create a DataFrame from a binding site and interaction type.
 
@@ -164,9 +166,7 @@ def create_df_from_binding_site(
         )
         return df
 
-    def change_lig_to_residue(
-        self, file_path: str, new_residue_name: str
-    ) -> None:
+    def change_lig_to_residue(self, file_path: str, new_residue_name: str) -> None:
         """Reformat the topology file to change the ligand to a residue.
 
         Args:
@@ -202,7 +202,7 @@ def process_frame_special(self, frame: int) -> pd.DataFrame:
         atoms_selected = self.pdb_md.select_atoms(
             f"protein or nucleic or resname {self.lig_name} or (resname HOH and around 10 resname {self.lig_name}) or resname {self.special}"
         )
-        for num in self.pdb_md.trajectory[(frame):(frame + 1)]:
+        for num in self.pdb_md.trajectory[(frame) : (frame + 1)]:
             atoms_selected.write(f"processing_frame_{frame}.pdb")
         interaction_list = pd.DataFrame()
         interactions_by_site = self.retrieve_plip_interactions(
@@ -230,7 +230,7 @@ def process_frame_special(self, frame: int) -> pd.DataFrame:
             tmp_interaction["frame"] = frame
             tmp_interaction["interaction_type"] = interaction_type
             interaction_list = pd.concat([interaction_list, tmp_interaction])
-        
+
         os.remove(f"processing_frame_{frame}.pdb")
         return interaction_list
 
@@ -246,7 +246,7 @@ def process_frame(self, frame: int) -> pd.DataFrame:
         atoms_selected = self.pdb_md.select_atoms(
             f"protein or nucleic or resname {self.lig_name} or (resname HOH and around 10 resname {self.lig_name}) or resname {self.special}"
         )
-        for num in self.pdb_md.trajectory[(frame):(frame + 1)]:
+        for num in self.pdb_md.trajectory[(frame) : (frame + 1)]:
             atoms_selected.write(f"processing_frame_{frame}.pdb")
         interaction_list = pd.DataFrame()
         if self.peptide is None:
@@ -298,7 +298,7 @@ def process_frame(self, frame: int) -> pd.DataFrame:
                 tmp_interaction["frame"] = frame
                 tmp_interaction["interaction_type"] = interaction_type
                 interaction_list = pd.concat([interaction_list, tmp_interaction])
-        
+
         os.remove(f"processing_frame_{frame}.pdb")
         return interaction_list
 
@@ -331,11 +331,13 @@ def fill_missing_frames(self, df: pd.DataFrame) -> pd.DataFrame:
 
         for frame in missing_frames:
             for interaction_type in df["interaction_type"].unique():
-                missing_data.append({
-                    "frame": frame,
-                    "interaction_type": interaction_type,
-                    # Add other necessary columns with NaNs or empty values as appropriate
-                })
+                missing_data.append(
+                    {
+                        "frame": frame,
+                        "interaction_type": interaction_type,
+                        # Add other necessary columns with NaNs or empty values as appropriate
+                    }
+                )
 
         missing_df = pd.DataFrame(missing_data)
         filled_df = pd.concat([df, missing_df], ignore_index=True)
@@ -352,7 +354,10 @@ def process_trajectory(self) -> pd.DataFrame:
             interaction_list = pd.concat(
                 list(
                     tqdm(
-                        pool.imap(self.process_frame_wrapper, range(len(self.pdb_md.trajectory))),
+                        pool.imap(
+                            self.process_frame_wrapper,
+                            range(len(self.pdb_md.trajectory)),
+                        ),
                         total=len(self.pdb_md.trajectory),
                     )
                 )
@@ -362,8 +367,8 @@ def process_trajectory(self) -> pd.DataFrame:
 
         if self.dataframe:
             interaction_list.to_csv(self.dataframe)
-        
+
         # Ensure all frames are represented in the final DataFrame
         interaction_list = self.fill_missing_frames(interaction_list)
-        
+
         return interaction_list
diff --git a/openmmdl/openmmdl_analysis/markov_state_figure_generation.py b/openmmdl/openmmdl_analysis/markov_state_figure_generation.py
index 7e40c020..220acff8 100644
--- a/openmmdl/openmmdl_analysis/markov_state_figure_generation.py
+++ b/openmmdl/openmmdl_analysis/markov_state_figure_generation.py
@@ -4,6 +4,7 @@
 import os
 from typing import Dict, List, Tuple, Union
 
+
 class MarkovChainAnalysis:
     def __init__(self, min_transition: float):
         self.min_transition = min_transition
@@ -29,7 +30,7 @@ def generate_transition_graph(
         combined_dict: Dict[str, List[str]],
         fig_type: str = "png",
         font_size: int = 36,
-        size_node: int = 200
+        size_node: int = 200,
     ) -> None:
         """Generate Markov Chain plots based on transition probabilities.
 
@@ -49,8 +50,10 @@ def generate_transition_graph(
         part1_length: int = part_length + remaining_length
         part2_length: int = part_length
         part1_data: List[str] = combined_dict["all"][:part1_length]
-        part2_data: List[str] = combined_dict["all"][part1_length: part1_length + part2_length]
-        part3_data: List[str] = combined_dict["all"][part1_length + part2_length:]
+        part2_data: List[str] = combined_dict["all"][
+            part1_length : part1_length + part2_length
+        ]
+        part3_data: List[str] = combined_dict["all"][part1_length + part2_length :]
 
         # Count the occurrences of each node in each part
         part1_node_occurrences: Dict[str, int] = {
@@ -83,10 +86,14 @@ def generate_transition_graph(
         node_occurrences: Dict[str, int] = {
             node: combined_dict["all"].count(node) for node in set(combined_dict["all"])
         }
-        top_10_nodes: List[str] = sorted(node_occurrences, key=node_occurrences.get, reverse=True)[:10]
+        top_10_nodes: List[str] = sorted(
+            node_occurrences, key=node_occurrences.get, reverse=True
+        )[:10]
 
         for min_transition_percent in self.min_transitions:
-            min_prob: float = min_transition_percent / 100  # Convert percentage to probability
+            min_prob: float = (
+                min_transition_percent / 100
+            )  # Convert percentage to probability
 
             # Create a directed graph
             G: nx.DiGraph = nx.DiGraph()
@@ -108,19 +115,32 @@ def generate_transition_graph(
             # Add edges to the graph with their probabilities
             for transition, count in transitions.items():
                 current_state, next_state = transition
-                probability: float = (count / len(combined_dict["all"])) * 100  # Convert probability to percentage
+                probability: float = (
+                    count / len(combined_dict["all"])
+                ) * 100  # Convert probability to percentage
                 if probability >= min_transition_percent:
                     G.add_edge(current_state, next_state, weight=probability)
                     # Include the reverse transition with a different color
                     reverse_transition: Tuple[str, str] = (next_state, current_state)
-                    reverse_count: int = transitions.get(reverse_transition, 0)  # Use the correct count for the reverse transition
-                    reverse_probability: float = (reverse_count / len(combined_dict["all"])) * 100
-                    G.add_edge(next_state, current_state, weight=reverse_probability, reverse=True)
+                    reverse_count: int = transitions.get(
+                        reverse_transition, 0
+                    )  # Use the correct count for the reverse transition
+                    reverse_probability: float = (
+                        reverse_count / len(combined_dict["all"])
+                    ) * 100
+                    G.add_edge(
+                        next_state,
+                        current_state,
+                        weight=reverse_probability,
+                        reverse=True,
+                    )
 
             # Add self-loops to the graph with their probabilities
             for self_loop, count in self_loops.items():
                 state: str = self_loop[0]
-                probability: float = (count / len(combined_dict["all"])) * 100  # Convert probability to percentage
+                probability: float = (
+                    count / len(combined_dict["all"])
+                ) * 100  # Convert probability to percentage
                 if probability >= min_transition_percent:
                     G.add_edge(state, state, weight=probability)
 
@@ -139,11 +159,19 @@ def generate_transition_graph(
                 forward_count: int = transitions.get(forward_transition, 0)
                 backward_count: int = transitions.get(backward_transition, 0)
 
-                transition_probabilities_forward[forward_transition] = (forward_count / node_occurrences[start_state]) * 100
-                transition_occurrences_forward[forward_transition] = (forward_count / len(combined_dict["all"])) * 100
+                transition_probabilities_forward[forward_transition] = (
+                    forward_count / node_occurrences[start_state]
+                ) * 100
+                transition_occurrences_forward[forward_transition] = (
+                    forward_count / len(combined_dict["all"])
+                ) * 100
 
-                transition_probabilities_backward[backward_transition] = (backward_count / node_occurrences[end_state]) * 100
-                transition_occurrences_backward[backward_transition] = (backward_count / len(combined_dict["all"])) * 100
+                transition_probabilities_backward[backward_transition] = (
+                    backward_count / node_occurrences[end_state]
+                ) * 100
+                transition_occurrences_backward[backward_transition] = (
+                    backward_count / len(combined_dict["all"])
+                ) * 100
 
             # Calculate self-loop probabilities
             self_loop_probabilities: Dict[str, float] = {}
@@ -155,10 +183,15 @@ def generate_transition_graph(
 
             # Generate the Markov Chain plot
             plt.figure(figsize=(60, 60))  # Increased figure size
-            plt.title(f"Markov Chain Plot {min_transition_percent}% Frames Transition", fontsize=72)
+            plt.title(
+                f"Markov Chain Plot {min_transition_percent}% Frames Transition",
+                fontsize=72,
+            )
 
             # Draw nodes and edges
-            pos: Dict[str, Tuple[float, float]] = nx.spring_layout(G, k=2, seed=42)  # Increased distance between nodes (k=2)
+            pos: Dict[str, Tuple[float, float]] = nx.spring_layout(
+                G, k=2, seed=42
+            )  # Increased distance between nodes (k=2)
             edge_colors: List[str] = []
 
             for u, v, data in G.edges(data=True):
@@ -167,7 +200,9 @@ def generate_transition_graph(
                 if u == v:  # Check if it is a self-loop
                     edge_colors.append("green")  # Set green color for self-loop arrows
                     width: float = 0.1  # Make self-loop arrows smaller
-                    connection_style: str = "arc3,rad=-0.1"  # Make the self-loops more curved and compact
+                    connection_style: str = (
+                        "arc3,rad=-0.1"  # Make the self-loops more curved and compact
+                    )
                     nx.draw_networkx_edges(
                         G,
                         pos,
@@ -178,13 +213,19 @@ def generate_transition_graph(
                         connectionstyle=connection_style,
                     )
                 elif weight >= min_transition_percent:
-                    edge_colors.append("black")  # Highlight significant transitions in black
+                    edge_colors.append(
+                        "black"
+                    )  # Highlight significant transitions in black
 
                     # Check if both nodes are present before adding labels
                     if G.has_node(u) and G.has_node(v):
                         width: float = 4.0
-                        forward_label: str = f"{transition_occurrences_forward.get((v, u), 0):.2f}% of Frames →, {transition_probabilities_forward.get((v, u), 0):.2f}% probability"
-                        backward_label: str = f"{transition_occurrences_backward.get((u, v), 0):.2f}% of Frames ←, {transition_probabilities_backward.get((u, v), 0):.2f}% probability"
+                        forward_label: str = (
+                            f"{transition_occurrences_forward.get((v, u), 0):.2f}% of Frames →, {transition_probabilities_forward.get((v, u), 0):.2f}% probability"
+                        )
+                        backward_label: str = (
+                            f"{transition_occurrences_backward.get((u, v), 0):.2f}% of Frames ←, {transition_probabilities_backward.get((u, v), 0):.2f}% probability"
+                        )
                         edge_label: str = f"{forward_label}\n{backward_label}"
 
                         connection_style: str = "arc3,rad=-0.1"
@@ -197,15 +238,23 @@ def generate_transition_graph(
                             edge_color=edge_colors[-1],
                             connectionstyle=connection_style,
                         )
-                        nx.draw_networkx_edge_labels(G, pos, edge_labels={(u, v): edge_label}, font_size=26)
+                        nx.draw_networkx_edge_labels(
+                            G, pos, edge_labels={(u, v): edge_label}, font_size=26
+                        )
                 else:
-                    edge_colors.append("grey")  # Use grey for non-significant transitions
+                    edge_colors.append(
+                        "grey"
+                    )  # Use grey for non-significant transitions
                     width: float = 0.5
 
                     # Check if both nodes are present before adding labels
                     if G.has_node(u) and G.has_node(v):
-                        forward_label: str = f"{transition_occurrences_forward.get((v, u), 0):.2f}% of Frames →, {transition_probabilities_forward.get((v, u), 0):.2f}% probability"
-                        backward_label: str = f"{transition_occurrences_backward.get((u, v), 0):.2f}% of Frames ←, {transition_probabilities_backward.get((u, v), 0):.2f}% probability"
+                        forward_label: str = (
+                            f"{transition_occurrences_forward.get((v, u), 0):.2f}% of Frames →, {transition_probabilities_forward.get((v, u), 0):.2f}% probability"
+                        )
+                        backward_label: str = (
+                            f"{transition_occurrences_backward.get((u, v), 0):.2f}% of Frames ←, {transition_probabilities_backward.get((u, v), 0):.2f}% probability"
+                        )
                         edge_label: str = f"{forward_label}\n{backward_label}"
 
                         connection_style: str = "arc3,rad=-0.1"
@@ -218,15 +267,23 @@ def generate_transition_graph(
                             edge_color=edge_colors[-1],
                             connectionstyle=connection_style,
                         )
-                        nx.draw_networkx_edge_labels(G, pos, edge_labels={(u, v): edge_label}, font_size=36)
+                        nx.draw_networkx_edge_labels(
+                            G, pos, edge_labels={(u, v): edge_label}, font_size=36
+                        )
 
             # Update the node colors based on their appearance percentages in each part
             node_colors: List[str] = []
             for node in G.nodes():
                 if node in top_10_nodes:
-                    part1_percentage: float = (part1_node_occurrences.get(node, 0) / node_occurrences[node])
-                    part2_percentage: float = (part2_node_occurrences.get(node, 0) / node_occurrences[node])
-                    part3_percentage: float = (part3_node_occurrences.get(node, 0) / node_occurrences[node])
+                    part1_percentage: float = (
+                        part1_node_occurrences.get(node, 0) / node_occurrences[node]
+                    )
+                    part2_percentage: float = (
+                        part2_node_occurrences.get(node, 0) / node_occurrences[node]
+                    )
+                    part3_percentage: float = (
+                        part3_node_occurrences.get(node, 0) / node_occurrences[node]
+                    )
 
                     if part1_percentage > 0.5:
                         node_colors.append("green")
@@ -240,8 +297,12 @@ def generate_transition_graph(
                     node_colors.append("skyblue")
 
             # Draw nodes with sizes correlated to occurrences and color top 10 nodes
-            node_size: List[int] = [size_node * node_occurrences[node] for node in G.nodes()]
-            nx.draw_networkx_nodes(G, pos, node_size=node_size, node_color=node_colors, alpha=0.8)
+            node_size: List[int] = [
+                size_node * node_occurrences[node] for node in G.nodes()
+            ]
+            nx.draw_networkx_nodes(
+                G, pos, node_size=node_size, node_color=node_colors, alpha=0.8
+            )
 
             # Draw node labels with occurrence percentage and self-loop probability for nodes with edges
             node_labels: Dict[str, str] = {}
@@ -261,10 +322,18 @@ def generate_transition_graph(
 
                     if relevant_edges:
                         if node in top_10_nodes:
-                            node_occurrence_percentage: float = (node_occurrences[node] / len(combined_dict["all"])) * 100
-                            self_loop_probability: float = self_loop_probabilities.get(node, 0) * 100
-                            self_loop_occurence: float = self_loop_occurences.get(node, 0)
-                            node_label: str = f"{node}\nOccurrences: {node_occurrence_percentage:.2f}%\nSelf-Loop Probability: {self_loop_probability:.2f}% \nSelf-Loop Occurrence: {self_loop_occurence:.2f}%"
+                            node_occurrence_percentage: float = (
+                                node_occurrences[node] / len(combined_dict["all"])
+                            ) * 100
+                            self_loop_probability: float = (
+                                self_loop_probabilities.get(node, 0) * 100
+                            )
+                            self_loop_occurence: float = self_loop_occurences.get(
+                                node, 0
+                            )
+                            node_label: str = (
+                                f"{node}\nOccurrences: {node_occurrence_percentage:.2f}%\nSelf-Loop Probability: {self_loop_probability:.2f}% \nSelf-Loop Occurrence: {self_loop_occurence:.2f}%"
+                            )
                             node_labels[node] = node_label
                         else:
                             node_labels[node] = node
@@ -285,9 +354,13 @@ def generate_transition_graph(
             plt.tight_layout()
 
             # Save the plot
-            plot_filename: str = f"markov_chain_plot_{min_transition_percent}.{fig_type}"
+            plot_filename: str = (
+                f"markov_chain_plot_{min_transition_percent}.{fig_type}"
+            )
             plot_path: str = os.path.join("Binding_Modes_Markov_States", plot_filename)
-            os.makedirs("Binding_Modes_Markov_States", exist_ok=True)  # Create the folder if it doesn't exist
+            os.makedirs(
+                "Binding_Modes_Markov_States", exist_ok=True
+            )  # Create the folder if it doesn't exist
 
             plt.savefig(plot_path, dpi=300)
             plt.clf()  # Clear the current figure for the next iteration
diff --git a/openmmdl/openmmdl_analysis/openmmdlanalysis.py b/openmmdl/openmmdl_analysis/openmmdlanalysis.py
index 0b5c3940..70286a4b 100644
--- a/openmmdl/openmmdl_analysis/openmmdlanalysis.py
+++ b/openmmdl/openmmdl_analysis/openmmdlanalysis.py
@@ -373,7 +373,7 @@ def main():
     interaction_analysis = InteractionAnalyzer(
         pdb_md, dataframe, cpu_count, ligand, special_ligand, peptide
     )
-    interaction_list = interaction_analysis.ineraction_list
+    interaction_list = interaction_analysis.interaction_list
     interaction_list.to_csv("missing_frames_filled.csv")
     interaction_list = interaction_list.reset_index(drop=True)
 
diff --git a/openmmdl/openmmdl_analysis/pharmacophore.py b/openmmdl/openmmdl_analysis/pharmacophore.py
index fb09eacf..76d00263 100644
--- a/openmmdl/openmmdl_analysis/pharmacophore.py
+++ b/openmmdl/openmmdl_analysis/pharmacophore.py
@@ -4,6 +4,7 @@
 import numpy as np
 from typing import Dict, List, Optional, Union
 
+
 class PharmacophoreGenerator:
     def __init__(self, df_all: pd.DataFrame, ligand_name: str):
         self.df_all = df_all
@@ -12,7 +13,9 @@ def __init__(self, df_all: pd.DataFrame, ligand_name: str):
         self.coord_pattern = re.compile(r"\(([\d.-]+), ([\d.-]+), ([\d.-]+)\)")
         self.clouds = self._generate_clouds()
 
-    def _generate_clouds(self) -> Dict[str, Dict[str, Union[List[List[float]], List[float], float]]]:
+    def _generate_clouds(
+        self,
+    ) -> Dict[str, Dict[str, Union[List[List[float]], List[float], float]]]:
         interaction_coords = {
             "hydrophobic": [],
             "acceptor": [],
@@ -57,7 +60,9 @@ def _generate_clouds(self) -> Dict[str, Dict[str, Union[List[List[float]], List[
 
         return self._format_clouds(interaction_coords)
 
-    def _format_clouds(self, interaction_coords: Dict[str, List[List[float]]]) -> Dict[str, Dict[str, Union[List[List[float]], List[float], float]]]:
+    def _format_clouds(
+        self, interaction_coords: Dict[str, List[List[float]]]
+    ) -> Dict[str, Dict[str, Union[List[List[float]], List[float], float]]]:
         color_mapping = {
             "hydrophobic": [1.0, 1.0, 0.0],
             "acceptor": [1.0, 0.0, 0.0],
@@ -79,10 +84,14 @@ def _format_clouds(self, interaction_coords: Dict[str, List[List[float]]]) -> Di
             for interaction, coords in interaction_coords.items()
         }
 
-    def to_dict(self) -> Dict[str, Dict[str, Union[List[List[float]], List[float], float]]]:
+    def to_dict(
+        self,
+    ) -> Dict[str, Dict[str, Union[List[List[float]], List[float], float]]]:
         return self.clouds
 
-    def generate_pharmacophore_centers(self, interactions: List[str]) -> Dict[str, List[float]]:
+    def generate_pharmacophore_centers(
+        self, interactions: List[str]
+    ) -> Dict[str, List[float]]:
         """Generates pharmacophore points for interactions that are points such as hydrophobic and ionic interactions.
 
         Args:
@@ -113,7 +122,9 @@ def generate_pharmacophore_centers(self, interactions: List[str]) -> Dict[str, L
                 pharmacophore[interaction] = [center_x, center_y, center_z]
         return pharmacophore
 
-    def generate_pharmacophore_vectors(self, interactions: List[str]) -> Dict[str, List[List[float]]]:
+    def generate_pharmacophore_vectors(
+        self, interactions: List[str]
+    ) -> Dict[str, List[List[float]]]:
         """Generates pharmacophore points for interactions that are vectors such as hydrogen bond donors or acceptors.
 
         Args:
@@ -332,7 +343,7 @@ def generate_bindingmode_pharmacophore(
         self,
         dict_bindingmode: Dict[str, Dict[str, List[List[float]]]],
         outname: str,
-        id_num: int = 0
+        id_num: int = 0,
     ) -> None:
         """Generates pharmacophore from a binding mode and writes it to a .pml file.
 
@@ -366,10 +377,12 @@ def generate_bindingmode_pharmacophore(
 
         for interaction, coords in dict_bindingmode.items():
             # Get feature type
-            feature_type = next((ft for it, ft in feature_types.items() if it in interaction), None)
+            feature_type = next(
+                (ft for it, ft in feature_types.items() if it in interaction), None
+            )
             if feature_type is None:
                 continue
-            
+
             # Generate vector features
             if feature_type in ["HBA", "HBD"]:
                 if feature_type == "HBA":
@@ -479,7 +492,9 @@ def generate_bindingmode_pharmacophore(
             xml_declaration=True,
         )
 
-     def generate_pharmacophore_centers_all_points(self, interactions: List[str]) -> Dict[str, List[List[float]]]:
+    def generate_pharmacophore_centers_all_points(
+        self, interactions: List[str]
+    ) -> Dict[str, List[List[float]]]:
         """Generates pharmacophore points for all interactions to generate point cloud.
 
         Args:
diff --git a/openmmdl/openmmdl_analysis/preprocessing.py b/openmmdl/openmmdl_analysis/preprocessing.py
index 0c7530b1..d24c16b3 100644
--- a/openmmdl/openmmdl_analysis/preprocessing.py
+++ b/openmmdl/openmmdl_analysis/preprocessing.py
@@ -9,6 +9,7 @@
 from openbabel import pybel
 import mdtraj as md
 
+
 class Preprocessing:
     def __init__(self):
         pass
@@ -31,9 +32,31 @@ def renumber_protein_residues(
         ref_top_df, _ = traj_reference.topology.to_dataframe()
 
         protein_residues = [
-            "ALA", "ARG", "ASN", "ASP", "CYS", "GLN", "GLU", "GLY", "HIS", "ILE",
-            "LEU", "LYS", "MET", "PHE", "PRO", "SER", "THR", "TRP", "TYR", "VAL",
-            "ACE", "NME", "HIE", "HID", "HIP"
+            "ALA",
+            "ARG",
+            "ASN",
+            "ASP",
+            "CYS",
+            "GLN",
+            "GLU",
+            "GLY",
+            "HIS",
+            "ILE",
+            "LEU",
+            "LYS",
+            "MET",
+            "PHE",
+            "PRO",
+            "SER",
+            "THR",
+            "TRP",
+            "TYR",
+            "VAL",
+            "ACE",
+            "NME",
+            "HIE",
+            "HID",
+            "HIP",
         ]
 
         for chain_id in ref_top_df["chainID"].unique():
@@ -41,10 +64,13 @@ def renumber_protein_residues(
                 ref_top_df[
                     (ref_top_df["chainID"] == chain_id)
                     & ref_top_df["resName"].isin(protein_residues)
-                ]["resSeq"].values - 1
+                ]["resSeq"].values
+                - 1
             )
 
-            mask = (input_top_df["chainID"] == chain_id) & input_top_df["resName"].isin(protein_residues)
+            mask = (input_top_df["chainID"] == chain_id) & input_top_df["resName"].isin(
+                protein_residues
+            )
             input_top_df.loc[mask, "resSeq"] = ref_residue_indices + 1
 
         new_top = md.Topology.from_dataframe(input_top_df)
@@ -113,7 +139,9 @@ def extract_and_save_ligand_as_sdf(
         ligand_atoms = u.select_atoms(f"resname {target_resname}")
 
         if len(ligand_atoms) == 0:
-            print(f"No ligand with residue name '{target_resname}' found in the PDB file.")
+            print(
+                f"No ligand with residue name '{target_resname}' found in the PDB file."
+            )
             return
 
         ligand_universe = mda.Merge(ligand_atoms)
@@ -149,7 +177,9 @@ def renumber_atoms_in_residues(
                 if residue_name == lig_name:
                     element_match = re.match(r"([A-Z]+)", atom_name)
                     element = element_match.group(1) if element_match else atom_name
-                    lig_residue_elements[element] = lig_residue_elements.get(element, 0) + 1
+                    lig_residue_elements[element] = (
+                        lig_residue_elements.get(element, 0) + 1
+                    )
                     new_atom_name = f"{element}{lig_residue_elements[element]}"
                     line = line[:12] + f"{new_atom_name:4}" + line[16:]
 
diff --git a/openmmdl/openmmdl_analysis/rdkit_figure_generation.py b/openmmdl/openmmdl_analysis/rdkit_figure_generation.py
index d630f30a..694b0337 100644
--- a/openmmdl/openmmdl_analysis/rdkit_figure_generation.py
+++ b/openmmdl/openmmdl_analysis/rdkit_figure_generation.py
@@ -7,7 +7,8 @@
 import pylab
 import os
 import MDAnalysis as mda
-
+from typing import List, Dict, Tuple
+from typing import List, Optional
 
 class LigandImageGenerator:
     def __init__(
@@ -104,8 +105,8 @@ def generate_image(self) -> None:
             print(f"Error: {e}")
 
 
-import MDAnalysis as mda
-from typing import List, Dict, Tuple
+
+
 
 class InteractionProcessor:
     def __init__(self, complex_pdb_file: str, ligand_no_h_pdb_file: str):
@@ -141,8 +142,20 @@ def split_interaction_data(self, data: List[str]) -> List[str]:
             split_data.append(split_value)
         return split_data
 
-    def highlight_numbers(self, split_data: List[str], starting_idx: List[int]) -> Tuple[
-        List[int], List[int], List[int], List[int], List[int], List[int], List[int], List[int], List[int], List[int], List[int]
+    def highlight_numbers(
+        self, split_data: List[str], starting_idx: List[int]
+    ) -> Tuple[
+        List[int],
+        List[int],
+        List[int],
+        List[int],
+        List[int],
+        List[int],
+        List[int],
+        List[int],
+        List[int],
+        List[int],
+        List[int],
     ]:
         """Extracts the data from the split_data output of the interactions and categorizes it to its respective list.
 
@@ -209,7 +222,9 @@ def highlight_numbers(self, split_data: List[str], starting_idx: List[int]) -> T
                         if atom_name is None:
                             continue
                         lig_atom = self.lig_noh.select_atoms(f"name {atom_name}")
-                        lig_real_index = lig_atom[0].id - 1 if len(lig_atom) > 0 else None
+                        lig_real_index = (
+                            lig_atom[0].id - 1 if len(lig_atom) > 0 else None
+                        )
                         if lig_real_index is not None:
                             highlighted_pistacking.append(lig_real_index)
 
@@ -223,11 +238,15 @@ def highlight_numbers(self, split_data: List[str], starting_idx: List[int]) -> T
                         for code in split_codes:
                             atom_index = int(code)
                             complex_id = self.complex.select_atoms(f"id {atom_index}")
-                            atom_name = complex_id[0].name if len(complex_id) > 0 else None
+                            atom_name = (
+                                complex_id[0].name if len(complex_id) > 0 else None
+                            )
                             if atom_name is None:
                                 continue
                             lig_atom = self.lig_noh.select_atoms(f"name {atom_name}")
-                            lig_real_index = lig_atom[0].id - 1 if len(lig_atom) > 0 else None
+                            lig_real_index = (
+                                lig_atom[0].id - 1 if len(lig_atom) > 0 else None
+                            )
                             if lig_real_index is not None:
                                 highlighted_ni.append(lig_real_index)
 
@@ -235,11 +254,15 @@ def highlight_numbers(self, split_data: List[str], starting_idx: List[int]) -> T
                         for code in numeric_codes:
                             atom_index = int(code)
                             complex_id = self.complex.select_atoms(f"id {atom_index}")
-                            atom_name = complex_id[0].name if len(complex_id) > 0 else None
+                            atom_name = (
+                                complex_id[0].name if len(complex_id) > 0 else None
+                            )
                             if atom_name is None:
                                 continue
                             lig_atom = self.lig_noh.select_atoms(f"name {atom_name}")
-                            lig_real_index = lig_atom[0].id - 1 if len(lig_atom) > 0 else None
+                            lig_real_index = (
+                                lig_atom[0].id - 1 if len(lig_atom) > 0 else None
+                            )
                             if lig_real_index is not None:
                                 highlighted_pi.append(lig_real_index)
 
@@ -270,7 +293,9 @@ def highlight_numbers(self, split_data: List[str], starting_idx: List[int]) -> T
             highlighted_metal,
         )
 
-    def generate_interaction_dict(self, interaction_type: str, keys: List[int]) -> Dict[int, Tuple[float, float, float]]:
+    def generate_interaction_dict(
+        self, interaction_type: str, keys: List[int]
+    ) -> Dict[int, Tuple[float, float, float]]:
         """Generates a dictionary of interaction RGB color model based on the provided interaction type.
 
         Args:
@@ -297,11 +322,13 @@ def generate_interaction_dict(self, interaction_type: str, keys: List[int]) -> D
         if interaction_type not in interaction_dict:
             raise ValueError(f"Unknown interaction type: {interaction_type}")
 
-        return {
-            int(key): interaction_dict[interaction_type] for key in keys
-        }
+        return {int(key): interaction_dict[interaction_type] for key in keys}
 
-    def update_dict(self, target_dict: Dict[int, Tuple[float, float, float]], *source_dicts: Dict[int, Tuple[float, float, float]]):
+    def update_dict(
+        self,
+        target_dict: Dict[int, Tuple[float, float, float]],
+        *source_dicts: Dict[int, Tuple[float, float, float]],
+    ):
         """Updates the dictionary with the keys and values from other dictionaries.
 
         Args:
@@ -321,18 +348,13 @@ def update_dict(self, target_dict: Dict[int, Tuple[float, float, float]], *sourc
                     target_dict[key] = value
 
 
-from typing import List, Optional
-from PIL import Image
-import pylab
-import os
-
 class ImageMerger:
     def __init__(
-        self, 
-        binding_mode: str, 
-        occurrence_percent: float, 
-        split_data: List[str], 
-        merged_image_paths: List[str]
+        self,
+        binding_mode: str,
+        occurrence_percent: float,
+        split_data: List[str],
+        merged_image_paths: List[str],
     ) -> None:
         self.binding_mode = binding_mode
         self.occurrence_percent = occurrence_percent
@@ -361,7 +383,9 @@ def create_and_merge_images(self) -> List[str]:
         for i, data in enumerate(filtered_split_data):
             y = data_points[i]
             label = data.split()[-1]
-            type_ = data.split()[-2]  # Renamed 'type' to 'type_' to avoid conflict with the built-in type() function
+            type_ = data.split()[
+                -2
+            ]  # Renamed 'type' to 'type_' to avoid conflict with the built-in type() function
             if label == "hydrophobic":
                 (line,) = ax.plot(
                     x, y, label=data, color=(1.0, 1.0, 0.0), linewidth=5.0
diff --git a/openmmdl/openmmdl_analysis/rmsd_calculation.py b/openmmdl/openmmdl_analysis/rmsd_calculation.py
index cbcc4505..4715fbc1 100644
--- a/openmmdl/openmmdl_analysis/rmsd_calculation.py
+++ b/openmmdl/openmmdl_analysis/rmsd_calculation.py
@@ -27,7 +27,9 @@ class RMSDAnalyzer:
     def __init__(self, prot_lig_top_file: str, prot_lig_traj_file: str) -> None:
         self.prot_lig_top_file: str = prot_lig_top_file
         self.prot_lig_traj_file: str = prot_lig_traj_file
-        self.universe: mda.Universe = mda.Universe(prot_lig_top_file, prot_lig_traj_file)
+        self.universe: mda.Universe = mda.Universe(
+            prot_lig_top_file, prot_lig_traj_file
+        )
 
     def rmsd_for_atomgroups(
         self, fig_type: str, selection1: str, selection2: Optional[List[str]] = None
diff --git a/openmmdl/openmmdl_analysis/visualization.ipynb b/openmmdl/openmmdl_analysis/visualization.ipynb
index 8040c014..ab4e7b09 100644
--- a/openmmdl/openmmdl_analysis/visualization.ipynb
+++ b/openmmdl/openmmdl_analysis/visualization.ipynb
@@ -16,10 +16,10 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "ligname = 'UNK' # add the name of the ligand from the original PDB File\n",
-    "special = None # if special ligand like HEM is present add the name of the special ligand from the original PDB File\n",
-    "md = mda.Universe('PATH_TO_interacting_waters.pdb', 'PATH_TO_interacting_waters.dcd')\n",
-    "cloud = 'PATH_TO_clouds.json'\n",
+    "ligname = \"UNK\"  # add the name of the ligand from the original PDB File\n",
+    "special = None  # if special ligand like HEM is present add the name of the special ligand from the original PDB File\n",
+    "md = mda.Universe(\"PATH_TO_interacting_waters.pdb\", \"PATH_TO_interacting_waters.dcd\")\n",
+    "cloud = \"PATH_TO_clouds.json\"\n",
     "\n",
     "visualizer = Visualizer(md, cloud, ligname, special)"
    ]
@@ -61,7 +61,7 @@
    "source": [
     "# Predefined variables: receptor_type=\"protein or nucleic\", height=\"1000px\", width=\"1000px\"\n",
     "view = visualizer.visualize()\n",
-    "view.display(gui=True, style='ngl')"
+    "view.display(gui=True, style=\"ngl\")"
    ]
   }
  ],
diff --git a/openmmdl/openmmdl_analysis/visualization_functions.py b/openmmdl/openmmdl_analysis/visualization_functions.py
index 4b4f8441..37079ef2 100644
--- a/openmmdl/openmmdl_analysis/visualization_functions.py
+++ b/openmmdl/openmmdl_analysis/visualization_functions.py
@@ -11,7 +11,9 @@
 
 
 class TrajectorySaver:
-    def __init__(self, pdb_md: mda.Universe, ligname: str, special: str, nucleic: bool) -> None:
+    def __init__(
+        self, pdb_md: mda.Universe, ligname: str, special: str, nucleic: bool
+    ) -> None:
         """Initializes the TrajectorySaver with an mda.Universe object, ligand name, special residue name, and receptor type.
 
         Args:
@@ -25,7 +27,9 @@ def __init__(self, pdb_md: mda.Universe, ligname: str, special: str, nucleic: bo
         self.special = special
         self.nucleic = nucleic
 
-    def save_interacting_waters_trajectory(self, interacting_waters: List[int], outputpath: str = './Visualization/') -> None:
+    def save_interacting_waters_trajectory(
+        self, interacting_waters: List[int], outputpath: str = "./Visualization/"
+    ) -> None:
         """Saves .pdb and .dcd files of the trajectory containing ligand, receptor, and all interacting waters.
 
         Args:
@@ -48,7 +52,9 @@ def save_interacting_waters_trajectory(self, interacting_waters: List[int], outp
             for ts in self.pdb_md.trajectory:
                 W.write(water_atoms)
 
-    def save_frame(self, frame: int, outpath: str, selection: Optional[str] = None) -> None:
+    def save_frame(
+        self, frame: int, outpath: str, selection: Optional[str] = None
+    ) -> None:
         """Saves a single frame of the trajectory.
 
         Args:
@@ -65,13 +71,21 @@ def save_frame(self, frame: int, outpath: str, selection: Optional[str] = None)
 
 
 class Visualizer:
-    def __init__(self, md: mda.Universe, cloud_path: str, ligname: str, special: Optional[str] = None) -> None:
+    def __init__(
+        self,
+        md: mda.Universe,
+        cloud_path: str,
+        ligname: str,
+        special: Optional[str] = None,
+    ) -> None:
         self.md = md
         self.cloud = self.load_cloud(cloud_path)
         self.ligname = ligname
         self.special = special
 
-    def load_cloud(self, cloud_path: str) -> Dict[str, Dict[str, Union[List[float], List[int]]]]:
+    def load_cloud(
+        self, cloud_path: str
+    ) -> Dict[str, Dict[str, Union[List[float], List[int]]]]:
         """Loads interaction cloud data from a JSON file.
 
         Args:
@@ -85,7 +99,10 @@ def load_cloud(self, cloud_path: str) -> Dict[str, Dict[str, Union[List[float],
         return data
 
     def visualize(
-        self, receptor_type: str = "protein or nucleic", height: str = "1000px", width: str = "1000px"
+        self,
+        receptor_type: str = "protein or nucleic",
+        height: str = "1000px",
+        width: str = "1000px",
     ) -> nv.NGLWidget:
         """Generates visualization of the trajectory with the interacting waters and interaction clouds.
 

From babc360b47ca459ceb81b034916f5ce93d52f031 Mon Sep 17 00:00:00 2001
From: Valerij Talagayev <82884038+talagayev@users.noreply.github.com>
Date: Wed, 18 Sep 2024 11:40:12 +0200
Subject: [PATCH 13/23] Update barcode_generation.py

adjusted changes to fix some errors
---
 openmmdl/openmmdl_analysis/barcode_generation.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/openmmdl/openmmdl_analysis/barcode_generation.py b/openmmdl/openmmdl_analysis/barcode_generation.py
index ee804f9b..8bf6c78b 100644
--- a/openmmdl/openmmdl_analysis/barcode_generation.py
+++ b/openmmdl/openmmdl_analysis/barcode_generation.py
@@ -82,7 +82,7 @@ def generate_waterids_barcode(self, interaction: str) -> List[Union[int, int]]:
         water_id_list = []
         waterid_barcode = []
 
-        for _, row in self.df.iterrows():
+        for index, row in self.df.iterrows():
             if row[interaction] == 1:
                 water_id_list.append(int(float(row["WATER_IDX"])))
 
@@ -296,11 +296,11 @@ def plot_barcodes_grouped(
                     ligatom.pop(-1)
             ligatom = "_".join(ligatom)
             ligatoms_dict.setdefault(ligatom, []).append(interaction)
-
-        total_interactions: Dict[str, np.ndarray] = {}
-        for ligatom, interactions_list in ligatoms_dict.items():
-            ligatom_interaction_barcodes: Dict[str, np.ndarray] = {}
-            for interaction in interactions_list:
+        
+        total_interactions = {}
+        for ligatom in ligatoms_dict:
+            ligatom_interaction_barcodes = {}
+            for interaction in ligatoms_dict[ligatom]:
                 barcode = self.barcode_gen.generate_barcode(interaction)
                 ligatom_interaction_barcodes[interaction] = barcode
             os.makedirs(f"./Barcodes/{ligatom}", exist_ok=True)

From ed3adfb2ddfe881cf23cf929e5c57273bad6ce15 Mon Sep 17 00:00:00 2001
From: Valerij Talagayev <82884038+talagayev@users.noreply.github.com>
Date: Wed, 18 Sep 2024 12:05:44 +0200
Subject: [PATCH 14/23] Update binding_mode_processing.py

returned to original
---
 .../binding_mode_processing.py                | 399 +++++++++++-------
 1 file changed, 252 insertions(+), 147 deletions(-)

diff --git a/openmmdl/openmmdl_analysis/binding_mode_processing.py b/openmmdl/openmmdl_analysis/binding_mode_processing.py
index 135694d0..98a90c1a 100644
--- a/openmmdl/openmmdl_analysis/binding_mode_processing.py
+++ b/openmmdl/openmmdl_analysis/binding_mode_processing.py
@@ -7,47 +7,41 @@
 from tqdm import tqdm
 from pathlib import Path
 from numba import jit
-from typing import Dict, List, Union, Optional, Tuple
 
 
 class BindingModeProcesser:
+
     def __init__(
-        self,
-        pdb_md,
-        ligand: str,
-        peptide: Optional[str],
-        special: str,
-        ligand_rings: List[List[int]],
-        interaction_list: pd.DataFrame,
-        threshold: float,
-    ) -> None:
+        self, pdb_md, ligand, peptide, special, ligand_rings, interaction_list, treshold
+    ):
         self.pdb_md = pdb_md
         self.ligand = ligand
         self.peptide = peptide
         self.special = special
         self.ligand_rings = ligand_rings
         self.unique_columns_rings_grouped = self.gather_interactions(interaction_list)
-        self.interaction_list, self.unique_data = self.process_interaction_wrapper(
-            interaction_list, threshold / 100
+        self.interaction_list, self.unique_data = self.process_interaction_wraper(
+            interaction_list, (treshold / 100)
         )
-        self.interactions_all, self.unique_data_all = self.process_interaction_wrapper(
+        self.interactions_all, self.unique_data_all = self.process_interaction_wraper(
             interaction_list.copy(), 0.00001
         )
 
-    def process_interaction_wrapper(
-        self, interaction_list: pd.DataFrame, threshold: float
-    ) -> Tuple[pd.DataFrame, Dict[str, str]]:
-        filtered_values = self.filtering_values(threshold, interaction_list)
+    def process_interaction_wraper(self, interaction_list, treshold):
+
+        filtered_values = self.filtering_values(treshold, interaction_list)
         interaction_list.fillna(0, inplace=True)
         unique_data = self.unique_data_generation(filtered_values)
         self.df_iteration_numbering(interaction_list, unique_data)
         return interaction_list, unique_data
 
-    def gather_interactions(self, df: pd.DataFrame) -> Dict[int, Dict[int, str]]:
+    def gather_interactions(self, df):
         """Process a DataFrame with the protein-ligand interaction and generate column names for each unique interaction.
 
         Args:
-            df (pandas.DataFrame): DataFrame that contains the interaction data for the whole trajectory.
+            df (pandas dataframe): DataFrame that contains the interaction data for the whole trajectory.
+            ligand_rings (list): A list of the ligand ring information to recognize the atom numbers belonging to rings for hydrophobic interactions.
+            peptide (str, optional): chainid of the peptide in the topology. Defaults to None.
 
         Returns:
             dict: A dictionary with the keys being 'FRAME' numbers and values being dictionaries containing row indices and their corresponding unique column names for interactions.
@@ -55,17 +49,23 @@ def gather_interactions(self, df: pd.DataFrame) -> Dict[int, Dict[int, str]]:
         unique_columns_rings = {}
         unique_columns_rings_grouped = {}
 
+        # Iterate through the rows of the DataFrame
         if self.peptide is None:
             for index, row in df.iterrows():
-                interaction = row["INTERACTION"]
-                prot_partner = row["Prot_partner"]
-
-                if interaction == "hydrophobic":
+                # Check if the 'INTERACTION' is 'hydrophobic'
+                if row["INTERACTION"] == "hydrophobic":
+                    # Get the values from the current row
+                    prot_partner = row["Prot_partner"]
                     ligcarbonidx = row["LIGCARBONIDX"]
+                    interaction = row["INTERACTION"]
                     ring_found = False
+                    # Concatenate the values to form a unique column name
                     for ligand_ring in self.ligand_rings:
                         if ligcarbonidx in ligand_ring:
-                            numbers_as_strings = [str(idx) for idx in ligand_ring]
+                            numbers_as_strings = [
+                                str(ligcarbonidx) for ligcarbonidx in ligand_ring
+                            ]
+                            # Create the name with numbers separated by underscores
                             name_with_numbers = "_".join(numbers_as_strings)
                             col_name = (
                                 f"{prot_partner}_{name_with_numbers}_{interaction}"
@@ -73,121 +73,189 @@ def gather_interactions(self, df: pd.DataFrame) -> Dict[int, Dict[int, str]]:
                             ring_found = True
                             break
                     if not ring_found:
+                        ligcarbonidx = int(row["LIGCARBONIDX"])
                         col_name = f"{prot_partner}_{ligcarbonidx}_{interaction}"
-                elif interaction == "hbond":
-                    if row["PROTISDON"]:
+                elif row["INTERACTION"] == "hbond":
+                    if row["PROTISDON"] == True:
+                        prot_partner = row["Prot_partner"]
                         ligcarbonidx = int(row["ACCEPTORIDX"])
-                        type_ = "Acceptor"
-                    else:
+                        interaction = row["INTERACTION"]
+                        type = "Acceptor"
+                    elif row["PROTISDON"] == False:
+                        prot_partner = row["Prot_partner"]
                         ligcarbonidx = int(row["DONORIDX"])
-                        type_ = "Donor"
-                    col_name = f"{prot_partner}_{ligcarbonidx}_{type_}_{interaction}"
-                elif interaction == "halogen":
+                        interaction = row["INTERACTION"]
+                        type = "Donor"
+                    # Concatenate the values to form a unique column name
+                    col_name = f"{prot_partner}_{ligcarbonidx}_{type}_{interaction}"
+                elif row["INTERACTION"] == "halogen":
+                    prot_partner = row["Prot_partner"]
                     ligcarbonidx = int(row["DON_IDX"])
                     halogen = row["DONORTYPE"]
+                    interaction = row["INTERACTION"]
+                    # Concatenate the values to form a unique column name
                     col_name = f"{prot_partner}_{ligcarbonidx}_{halogen}_{interaction}"
-                elif interaction == "waterbridge":
-                    if row["PROTISDON"]:
+                elif row["INTERACTION"] == "waterbridge":
+                    if row["PROTISDON"] == True:
+                        prot_partner = row["Prot_partner"]
                         ligcarbonidx = int(row["ACCEPTOR_IDX"])
-                        type_ = "Acceptor"
-                    else:
+                        interaction = row["INTERACTION"]
+                        type = "Acceptor"
+                        # Concatenate the values to form a unique column name
+                        col_name = f"{prot_partner}_{ligcarbonidx}_{type}_{interaction}"
+                    elif row["PROTISDON"] == False:
+                        prot_partner = row["Prot_partner"]
                         ligcarbonidx = int(row["DONOR_IDX"])
-                        type_ = "Donor"
-                    col_name = f"{prot_partner}_{ligcarbonidx}_{type_}_{interaction}"
-                elif interaction == "pistacking":
+                        interaction = row["INTERACTION"]
+                        type = "Donor"
+                        # Concatenate the values to form a unique column name
+                        col_name = f"{prot_partner}_{ligcarbonidx}_{type}_{interaction}"
+                elif row["INTERACTION"] == "pistacking":
+                    prot_partner = row["Prot_partner"]
                     ligcarbonidx = row["LIG_IDX_LIST"]
+                    interaction = row["INTERACTION"]
+                    # Concatenate the values to form a unique column name
                     col_name = f"{prot_partner}_{ligcarbonidx}_{interaction}"
-                elif interaction == "pication":
+                elif row["INTERACTION"] == "pication":
+                    prot_partner = row["Prot_partner"]
                     ligidx = row["LIG_IDX_LIST"]
                     ligtype = row["LIG_GROUP"]
+                    interaction = row["INTERACTION"]
+                    # Concatenate the values to form a unique column name
                     col_name = f"{prot_partner}_{ligidx}_{ligtype}_{interaction}"
                     col_name = col_name.replace(",", "_")
-                elif interaction == "saltbridge":
+                elif row["INTERACTION"] == "saltbridge":
+                    prot_partner = row["Prot_partner"]
                     ligidx = row["LIG_IDX_LIST"]
                     lig_group = row["LIG_GROUP"]
-                    if row["PROTISPOS"]:
-                        type_ = "NI"
-                    else:
-                        type_ = "PI"
-                    col_name = (
-                        f"{prot_partner}_{ligidx}_{lig_group}_{type_}_{interaction}"
-                    )
-                elif interaction == "metal":
+                    interaction = row["INTERACTION"]
+                    if row["PROTISPOS"] == True:
+                        type = "NI"
+                        # Concatenate the values to form a unique column name
+                        col_name = (
+                            f"{prot_partner}_{ligidx}_{lig_group}_{type}_{interaction}"
+                        )
+                    elif row["PROTISPOS"] == False:
+                        type = "PI"
+                        col_name = (
+                            f"{prot_partner}_{ligidx}_{lig_group}_{type}_{interaction}"
+                        )
+                elif row["INTERACTION"] == "metal":
                     special_ligand = row["RESTYPE_LIG"]
                     ligcarbonidx = int(row["TARGET_IDX"])
                     metal_type = row["METAL_TYPE"]
                     coordination = row["COORDINATION"]
+                    interaction = row["INTERACTION"]
+                    # Concatenate the values to form a unique column name
                     col_name = f"{special_ligand}_{ligcarbonidx}_{metal_type}_{coordination}_{interaction}"
-
                 frame_value = row["FRAME"]
                 if frame_value not in unique_columns_rings_grouped:
                     unique_columns_rings_grouped[frame_value] = {}
-                if interaction != "skip":
+                if row["INTERACTION"] != "skip":
                     unique_columns_rings_grouped[frame_value][index] = col_name
+                    # Add the column name and its value to the dictionary
                     unique_columns_rings[index] = col_name
-
-        else:  # If peptide is not None
+        if self.peptide is not None:
             for index, row in df.iterrows():
-                interaction = row["INTERACTION"]
-                prot_partner = row["Prot_partner"]
-                peptide_partner = f"{row['RESNR_LIG']}{row['RESTYPE_LIG']}"
-
-                if interaction == "hydrophobic":
+                # Check if the 'INTERACTION' is 'hydrophobic'
+                if row["INTERACTION"] == "hydrophobic":
+                    # Get the values from the current row
+                    prot_partner = row["Prot_partner"]
+                    peptide_partner = str(row["RESNR_LIG"]) + row["RESTYPE_LIG"]
+                    interaction = row["INTERACTION"]
+                    ring_found = False
                     col_name = f"{prot_partner}_{peptide_partner}_{interaction}"
-                elif interaction == "hbond":
-                    if row["PROTISDON"]:
-                        type_ = "Acceptor"
-                    else:
-                        type_ = "Donor"
-                    col_name = f"{prot_partner}_{peptide_partner}_{type_}_{interaction}"
-                elif interaction == "halogen":
+                elif row["INTERACTION"] == "hbond":
+                    if row["PROTISDON"] == True:
+                        prot_partner = row["Prot_partner"]
+                        peptide_partner = str(row["RESNR_LIG"]) + row["RESTYPE_LIG"]
+                        interaction = row["INTERACTION"]
+                        type = "Acceptor"
+                    elif row["PROTISDON"] == False:
+                        prot_partner = row["Prot_partner"]
+                        peptide_partner = str(row["RESNR_LIG"]) + row["RESTYPE_LIG"]
+                        interaction = row["INTERACTION"]
+                        type = "Donor"
+                    # Concatenate the values to form a unique column name
+                    col_name = f"{prot_partner}_{peptide_partner}_{type}_{interaction}"
+                elif row["INTERACTION"] == "halogen":
+                    prot_partner = row["Prot_partner"]
+                    peptide_partner = str(row["RESNR_LIG"]) + row["RESTYPE_LIG"]
                     halogen = row["DONORTYPE"]
+                    interaction = row["INTERACTION"]
+                    # Concatenate the values to form a unique column name
                     col_name = (
                         f"{prot_partner}_{peptide_partner}_{halogen}_{interaction}"
                     )
-                elif interaction == "waterbridge":
-                    if row["PROTISDON"]:
-                        type_ = "Acceptor"
-                    else:
-                        type_ = "Donor"
-                    col_name = f"{prot_partner}_{peptide_partner}_{type_}_{interaction}"
-                elif interaction == "pistacking":
+                elif row["INTERACTION"] == "waterbridge":
+                    if row["PROTISDON"] == True:
+                        prot_partner = row["Prot_partner"]
+                        peptide_partner = str(row["RESNR_LIG"]) + row["RESTYPE_LIG"]
+                        interaction = row["INTERACTION"]
+                        type = "Acceptor"
+                        # Concatenate the values to form a unique column name
+                        col_name = (
+                            f"{prot_partner}_{peptide_partner}_{type}_{interaction}"
+                        )
+                    elif row["PROTISDON"] == False:
+                        prot_partner = row["Prot_partner"]
+                        peptide_partner = str(row["RESNR_LIG"]) + row["RESTYPE_LIG"]
+                        interaction = row["INTERACTION"]
+                        type = "Donor"
+                        # Concatenate the values to form a unique column name
+                        col_name = (
+                            f"{prot_partner}_{peptide_partner}_{type}_{interaction}"
+                        )
+                elif row["INTERACTION"] == "pistacking":
+                    prot_partner = row["Prot_partner"]
+                    peptide_partner = str(row["RESNR_LIG"]) + row["RESTYPE_LIG"]
+                    interaction = row["INTERACTION"]
+                    # Concatenate the values to form a unique column name
                     col_name = f"{prot_partner}_{peptide_partner}_{interaction}"
-                elif interaction == "pication":
-                    ligidx = f"{row['RESNR_LIG']}{row['RESTYPE_LIG']}"
+                elif row["INTERACTION"] == "pication":
+                    prot_partner = row["Prot_partner"]
+                    ligidx = str(row["RESNR_LIG"]) + row["RESTYPE_LIG"]
                     ligtype = row["RESTYPE_LIG"]
+                    interaction = row["INTERACTION"]
+                    # Concatenate the values to form a unique column name
                     col_name = f"{prot_partner}_{ligidx}_{ligtype}_{interaction}"
                     col_name = col_name.replace(",", "_")
-                elif interaction == "saltbridge":
-                    ligidx = f"{row['RESNR_LIG']}{row['RESTYPE_LIG']}"
+                elif row["INTERACTION"] == "saltbridge":
+                    prot_partner = row["Prot_partner"]
+                    ligidx = str(row["RESNR_LIG"]) + row["RESTYPE_LIG"]
                     lig_group = row["RESTYPE_LIG"]
-                    if row["PROTISPOS"]:
-                        type_ = "NI"
-                    else:
-                        type_ = "PI"
-                    col_name = (
-                        f"{prot_partner}_{ligidx}_{lig_group}_{type_}_{interaction}"
-                    )
-                elif interaction == "metal":
+                    interaction = row["INTERACTION"]
+                    if row["PROTISPOS"] == True:
+                        type = "NI"
+                        # Concatenate the values to form a unique column name
+                        col_name = (
+                            f"{prot_partner}_{ligidx}_{lig_group}_{type}_{interaction}"
+                        )
+                    elif row["PROTISPOS"] == False:
+                        type = "PI"
+                        col_name = (
+                            f"{prot_partner}_{ligidx}_{lig_group}_{type}_{interaction}"
+                        )
+                elif row["INTERACTION"] == "metal":
                     special_ligand = row["RESTYPE_LIG"]
-                    ligcarbonidx = f"{row['RESNR_LIG']}{row['RESTYPE_LIG']}"
+                    ligcarbonidx = str(row["RESNR_LIG"]) + row["RESTYPE_LIG"]
                     metal_type = row["METAL_TYPE"]
                     coordination = row["COORDINATION"]
+                    interaction = row["INTERACTION"]
+                    # Concatenate the values to form a unique column name
                     col_name = f"{special_ligand}_{ligcarbonidx}_{metal_type}_{coordination}_{interaction}"
-
                 frame_value = row["FRAME"]
                 if frame_value not in unique_columns_rings_grouped:
                     unique_columns_rings_grouped[frame_value] = {}
-                if interaction != "skip":
+                if row["INTERACTION"] != "skip":
                     unique_columns_rings_grouped[frame_value][index] = col_name
+                    # Add the column name and its value to the dictionary
                     unique_columns_rings[index] = col_name
-
         print("\033[1minteraction partners generated\033[0m")
+
         return unique_columns_rings_grouped
 
-    def remove_duplicate_values(
-        self, data: Dict[int, Dict[int, str]]
-    ) -> Dict[int, Dict[int, str]]:
+    def remove_duplicate_values(self, data):
         """Remove the duplicate values from sub-dictionaries within the input dictionary.
 
         Args:
@@ -212,9 +280,7 @@ def remove_duplicate_values(
 
         return unique_data
 
-    def combine_subdict_values(
-        self, data: Dict[int, Dict[int, str]]
-    ) -> Dict[str, List[str]]:
+    def combine_subdict_values(self, data):
         """Combines the values from the individual sub-dictionaries into a single list.
 
         Args:
@@ -229,37 +295,52 @@ def combine_subdict_values(
 
         return combined_data
 
-    def filtering_values(self, threshold: float, df: pd.DataFrame) -> List[str]:
+    def filtering_values(self, threshold, df):
         """Filter and append values (interactions) to a DataFrame based on occurrence counts.
 
         Args:
-            threshold (float): A threshold value used for filtering the values (interactions) based on occurrence count.
-            df (pandas.DataFrame): DataFrame to which the filtered values (interactions) will be added.
+            threshold (float): A treshold value that is used for filtering of the values (interactions) based upon the occurence count.
+            df (pandas dataframe): DataFrame to which the filtered values (interactions) will be added.
+            unique_columns_rings_grouped (dict): Dictionary containing the grouped and unique values otained from gather_interactions.
 
         Returns:
-            list: A list of values, with unique values and their corresponding occurrence counts.
+            list: A list of values, with unique values and their corresponding occurence counts.
         """
+
         frames = len(self.pdb_md.trajectory) - 1
+        # Call the function to remove duplicate keys
         unique_data = self.remove_duplicate_values(self.unique_columns_rings_grouped)
+
+        # Call the function to combine sub-dictionary values
         unique_colums_rings_all = self.combine_subdict_values(unique_data)
+
+        # Flatten the list of values
         all_values = unique_colums_rings_all["all"]
 
+        # Count the occurrences of each value
         occurrences = {}
         for value in all_values:
-            occurrences[value] = occurrences.get(value, 0) + 1
+            if value in occurrences:
+                occurrences[value] += 1
+            else:
+                occurrences[value] = 1
 
+        # Calculate the threshold (20% of 1000)
         threshold = threshold * frames
+
+        # Filter out values that appear in less than 20% of 1000
         filtered_values = [
             value for value, count in occurrences.items() if count >= threshold
         ]
 
+        # Append the filtered values as columns to the DataFrame
         for value in filtered_values:
             df[value] = None
 
         return filtered_values
 
-    def unique_data_generation(self, filtered_values: List[str]) -> Dict[str, str]:
-        """Generate a dictionary containing the unique interactions from a list of filtered values obtained by filtering_values.
+    def unique_data_generation(self, filtered_values):
+        """Generate a dictionary conataing the unique interactions from a list of filtered values obtained by filtering_values.
 
         Args:
             filtered_values (list): A list of values, where the unique interactions are extracted from.
@@ -267,21 +348,25 @@ def unique_data_generation(self, filtered_values: List[str]) -> Dict[str, str]:
         Returns:
             dict: A dictionary containing the filtered unique interactions.
         """
+        # Create a new dictionary to store unique values
         unique_data = {}
+
+        # Loop through the elements of the data_list
         for element in filtered_values:
+            # Check if the element is not already in the unique_data dictionary keys
             if element not in unique_data:
+                # If not, add the element as both key and value
                 unique_data[element] = element
 
         return unique_data
 
-    def df_iteration_numbering(
-        self, df: pd.DataFrame, unique_data: Dict[str, str]
-    ) -> None:
+    def df_iteration_numbering(self, df, unique_data):
         """Loop through the DataFrame and assign the values 1 and 0 to the rows, depending if the corresponding interaction from unique data is present.
 
         Args:
-            df (pd.DataFrame): DataFrame which has the interaction data for all of the frames.
-            unique_data (Dict[str, str]): Dictionary that contains the unique interactions obtained from unique_data_generation.
+            df (pandas dataframe): DataFrame which has the interaction data for all of the frames.
+            unique_data (dict): Dictionary that contains the unique interactions obtained from unique_data_generation.
+            peptide (str, optional): name of the peptide chainid in the original topology. Defaults to None.
         """
         if self.peptide is None:
             for index, row in df.iterrows():
@@ -297,11 +382,13 @@ def df_iteration_numbering(
                                     row["INTERACTION"] == interaction
                                 )
                                 df.at[index, col] = 1 if condition else 0
+                            else:
+                                continue
                 elif row["INTERACTION"] == "hbond":
                     if row["PROTISDON"] == True:
                         for col in unique_data.values():
                             if "hbond" in col:
-                                prot_partner, ligcarbonidx, _type, interaction = (
+                                prot_partner, ligcarbonidx, type, interaction = (
                                     col.split("_")
                                 )
                                 ligcarbonidx = int(ligcarbonidx)
@@ -314,7 +401,7 @@ def df_iteration_numbering(
                     elif row["PROTISDON"] == False:
                         for col in unique_data.values():
                             if "hbond" in col:
-                                prot_partner, ligcarbonidx, _type, interaction = (
+                                prot_partner, ligcarbonidx, type, interaction = (
                                     col.split("_")
                                 )
                                 ligcarbonidx = int(ligcarbonidx)
@@ -351,8 +438,8 @@ def df_iteration_numbering(
                 elif row["INTERACTION"] == "waterbridge":
                     for col in unique_data.values():
                         if "waterbridge" in col:
-                            if row["PROTISDON"]:
-                                prot_partner, ligcarbonidx, _type, interaction = (
+                            if row["PROTISDON"] == True:
+                                prot_partner, ligcarbonidx, type, interaction = (
                                     col.split("_")
                                 )
                                 condition = (
@@ -361,8 +448,8 @@ def df_iteration_numbering(
                                     & (row["INTERACTION"] == interaction)
                                 )
                                 df.at[index, col] = 1 if condition else 0
-                            else:
-                                prot_partner, ligcarbonidx, _type, interaction = (
+                            elif row["PROTISDON"] == False:
+                                prot_partner, ligcarbonidx, type, interaction = (
                                     col.split("_")
                                 )
                                 condition = (
@@ -395,7 +482,7 @@ def df_iteration_numbering(
                             ligidx = parts[1:-3]
                             ligidx = ",".join(ligidx)
                             lig_group = parts[-3]
-                            _type = parts[-2]
+                            type = parts[-2]
                             interaction = parts[-1]
                             condition = (
                                 (row["Prot_partner"] == prot_partner)
@@ -419,8 +506,7 @@ def df_iteration_numbering(
                                 & (row["INTERACTION"] == interaction)
                             )
                             df.at[index, col] = 1 if condition else 0
-
-        else:  # If self.peptide is not None
+        if self.peptide is not None:
             for index, row in df.iterrows():
                 if row["INTERACTION"] == "hydrophobic":
                     for col in unique_data.values():
@@ -434,11 +520,13 @@ def df_iteration_numbering(
                                     row["INTERACTION"] == interaction
                                 )
                                 df.at[index, col] = 1 if condition else 0
+                            else:
+                                continue
                 elif row["INTERACTION"] == "hbond":
-                    if row["PROTISDON"]:
+                    if row["PROTISDON"] == True:
                         for col in unique_data.values():
                             if "hbond" in col:
-                                prot_partner, ligcarbonidx, _type, interaction = (
+                                prot_partner, ligcarbonidx, type, interaction = (
                                     col.split("_")
                                 )
                                 condition = (
@@ -450,10 +538,10 @@ def df_iteration_numbering(
                                     & (row["INTERACTION"] == interaction)
                                 )
                                 df.at[index, col] = 1 if condition else 0
-                    else:
+                    elif row["PROTISDON"] == False:
                         for col in unique_data.values():
                             if "hbond" in col:
-                                prot_partner, ligcarbonidx, _type, interaction = (
+                                prot_partner, ligcarbonidx, type, interaction = (
                                     col.split("_")
                                 )
                                 condition = (
@@ -497,31 +585,46 @@ def df_iteration_numbering(
                 elif row["INTERACTION"] == "waterbridge":
                     for col in unique_data.values():
                         if "waterbridge" in col:
-                            prot_partner, ligcarbonidx, _type, interaction = col.split(
-                                "_"
-                            )
-                            condition = (
-                                (row["Prot_partner"] == prot_partner)
-                                & (
-                                    (str(row["RESNR_LIG"]) + row["RESTYPE_LIG"])
-                                    == ligcarbonidx
+                            if row["PROTISDON"] == True:
+                                prot_partner, ligcarbonidx, type, interaction = (
+                                    col.split("_")
                                 )
-                                & (row["INTERACTION"] == interaction)
-                            )
-                            df.at[index, col] = 1 if condition else 0
+                                condition = (
+                                    (row["Prot_partner"] == prot_partner)
+                                    & (
+                                        (str(row["RESNR_LIG"]) + row["RESTYPE_LIG"])
+                                        == ligcarbonidx
+                                    )
+                                    & (row["INTERACTION"] == interaction)
+                                )
+                                df.at[index, col] = 1 if condition else 0
+                            elif row["PROTISDON"] == False:
+                                prot_partner, ligcarbonidx, type, interaction = (
+                                    col.split("_")
+                                )
+                                condition = (
+                                    (row["Prot_partner"] == prot_partner)
+                                    & (
+                                        (str(row["RESNR_LIG"]) + row["RESTYPE_LIG"])
+                                        == ligcarbonidx
+                                    )
+                                    & (row["INTERACTION"] == interaction)
+                                )
+                                df.at[index, col] = 1 if condition else 0
                 elif row["INTERACTION"] == "pication":
                     for col in unique_data.values():
                         if "pication" in col:
                             parts = col.split("_")
                             prot_partner = parts[0]
-                            ligidx = parts[1:-2]
-                            ligidx = ",".join(ligidx)
+                            ligidx = parts[1]
                             ligtype = parts[-2]
                             interaction = parts[-1]
                             condition = (
                                 (row["Prot_partner"] == prot_partner)
-                                & (row["RESNR_LIG"] == ligidx)
-                                & (row["RESTYPE_LIG"] == ligtype)
+                                & (
+                                    (str(row["RESNR_LIG"]) + row["RESTYPE_LIG"])
+                                    == ligidx
+                                )
                                 & (row["INTERACTION"] == interaction)
                             )
                             df.at[index, col] = 1 if condition else 0
@@ -530,15 +633,16 @@ def df_iteration_numbering(
                         if "saltbridge" in col:
                             parts = col.split("_")
                             prot_partner = parts[0]
-                            ligidx = parts[1:-3]
-                            ligidx = ",".join(ligidx)
+                            ligidx = parts[1]
                             lig_group = parts[-3]
-                            _type = parts[-2]
+                            type = parts[-2]
                             interaction = parts[-1]
                             condition = (
                                 (row["Prot_partner"] == prot_partner)
-                                & (row["RESNR_LIG"] == ligidx)
-                                & (row["RESTYPE_LIG"] == lig_group)
+                                & (
+                                    (str(row["RESNR_LIG"]) + row["RESTYPE_LIG"])
+                                    == ligidx
+                                )
                                 & (row["INTERACTION"] == interaction)
                             )
                             df.at[index, col] = 1 if condition else 0
@@ -547,26 +651,27 @@ def df_iteration_numbering(
                         if "metal" in col:
                             parts = col.split("_")
                             special_ligand = parts[0]
-                            ligidx = int(parts[1])
+                            ligidx = parts[1]
                             metal_type = parts[2]
                             interaction = parts[-1]
                             condition = (
                                 (row["RESTYPE_LIG"] == special_ligand)
-                                & (int(row["TARGET_IDX"]) == ligidx)
+                                & (
+                                    (str(row["RESNR_LIG"]) + row["RESTYPE_LIG"])
+                                    == ligidx
+                                )
                                 & (row["METAL_TYPE"] == metal_type)
                                 & (row["INTERACTION"] == interaction)
                             )
                             df.at[index, col] = 1 if condition else 0
 
-    def update_values(
-        self, df: pd.DataFrame, new: pd.DataFrame, unique_data: Dict[str, str]
-    ) -> None:
-        """Update the values in the input DataFrame based upon the frame values and a reference DataFrame.
+    def update_values(self, df, new, unique_data):
+        """Update the values in the input DataFrame based upon the frame values and an reference DataFrame.
 
         Args:
-        df (pandas.DataFrame): Input DataFrame that will be updated.
-        new (pandas.DataFrame): The reference DataFrame containing values that are used to update the input DataFrame.
-        unique_data (Dict[str, str]): A dictionary containing keys that represent the specific unique column names that need to be updated in the input DataFrame.
+            df (pandas dataframe): Input DataFrame that will be updated.
+            new (pandas dataframe): The reference DataFrame containing values that are used to update the input DataFrame.
+            unique_data (dict): A dictionary containing keys that represent the specific unique column names that need to be updated in the input DataFrame.
         """
         for idx, row in df.iterrows():
             frame_value = row["FRAME"]

From 17c4eca7ce2a0e036d3ae10c579aa1ce556ce853 Mon Sep 17 00:00:00 2001
From: Valerij Talagayev <82884038+talagayev@users.noreply.github.com>
Date: Fri, 25 Oct 2024 19:40:40 +0200
Subject: [PATCH 15/23] Update rmsd_calculation.py

adjusted code, removed duplicate mention of frames present from the earlier code
---
 .../openmmdl_analysis/rmsd_calculation.py     | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/openmmdl/openmmdl_analysis/rmsd_calculation.py b/openmmdl/openmmdl_analysis/rmsd_calculation.py
index 4715fbc1..65d64f76 100644
--- a/openmmdl/openmmdl_analysis/rmsd_calculation.py
+++ b/openmmdl/openmmdl_analysis/rmsd_calculation.py
@@ -50,7 +50,7 @@ def rmsd_for_atomgroups(
             self.universe, ref, select=selection1, groupselections=selection2
         )
         rmsd_analysis.run()
-        columns = [selection1] + selection2 if selection2 else [selection1]
+        columns = [selection1, *selection2] if selection2 else [selection1]
         rmsd_df = pd.DataFrame(np.round(rmsd_analysis.rmsd[:, 2:], 2), columns=columns)
         rmsd_df.index.name = "frame"
 
@@ -123,7 +123,7 @@ def rmsd_dist_frames(
             img1, ax=ax, orientation="horizontal", fraction=0.1, label="RMSD (Å)"
         )
 
-        plt.savefig(f"./RMSD/RMSD_between_the_frames.{fig_type}")
+        plt.savefig(f"{output_directory}/RMSD_between_the_frames.{fig_type}")
         return pairwise_rmsd_prot, pairwise_rmsd_lig
 
     def calc_rmsd_2frames(self, ref: np.ndarray, frame: np.ndarray) -> float:
@@ -152,25 +152,24 @@ def calculate_distance_matrix(self, selection: str) -> np.ndarray:
         return distances
 
     def calculate_representative_frame(
-        self, bmode_frames: List[int], DM: np.ndarray
+        self, binding_mode_frames: List[int], distance_matrix: np.ndarray
     ) -> int:
         """Calculates the most representative frame for a binding mode. This is based upon the average RMSD of a frame to all other frames in the binding mode.
 
         Args:
-            bmode_frames (list): List of frames belonging to a binding mode.
-            DM (np.array): Distance matrix of trajectory.
+            binding_mode_frames (list): List of frames belonging to a binding mode.
+            distance_matrix (np.array): Distance matrix of trajectory.
 
         Returns:
             int: Number of the most representative frame.
         """
-        frames: List[int] = bmode_frames
         mean_rmsd_per_frame: dict = {}
-        for frame_i in frames:
+        for frame_i in binding_mode_frames:
             mean_rmsd_per_frame[frame_i] = 0
-            for frame_j in frames:
+            for frame_j in binding_mode_frames:
                 if frame_j != frame_i:
-                    mean_rmsd_per_frame[frame_i] += DM[frame_i - 1, frame_j - 1]
-            mean_rmsd_per_frame[frame_i] /= len(frames)
+                    mean_rmsd_per_frame[frame_i] += distance_matrix[frame_i - 1, frame_j - 1]
+            mean_rmsd_per_frame[frame_i] /= len(binding_mode_frames)
 
         repre: int = min(mean_rmsd_per_frame, key=mean_rmsd_per_frame.get)
 

From 1f56384563909d7f0ab1948f3fcd0e563605bf32 Mon Sep 17 00:00:00 2001
From: Valerij Talagayev <82884038+talagayev@users.noreply.github.com>
Date: Fri, 25 Oct 2024 19:52:08 +0200
Subject: [PATCH 16/23] Update visualization_functions.py

looks good, removed the NV.Widget, since I am not sure if it is the correct object and the type is not that important here what nglview object it is
---
 openmmdl/openmmdl_analysis/visualization_functions.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/openmmdl/openmmdl_analysis/visualization_functions.py b/openmmdl/openmmdl_analysis/visualization_functions.py
index 37079ef2..8f935f7b 100644
--- a/openmmdl/openmmdl_analysis/visualization_functions.py
+++ b/openmmdl/openmmdl_analysis/visualization_functions.py
@@ -14,7 +14,7 @@ class TrajectorySaver:
     def __init__(
         self, pdb_md: mda.Universe, ligname: str, special: str, nucleic: bool
     ) -> None:
-        """Initializes the TrajectorySaver with an mda.Universe object, ligand name, special residue name, and receptor type.
+        """Initializes the TrajectorySaver with an mda.Universe object, ligand name, special residue name and receptor type.
 
         Args:
             pdb_md (mda.Universe): MDAnalysis Universe object containing the trajectory.
@@ -30,7 +30,7 @@ def __init__(
     def save_interacting_waters_trajectory(
         self, interacting_waters: List[int], outputpath: str = "./Visualization/"
     ) -> None:
-        """Saves .pdb and .dcd files of the trajectory containing ligand, receptor, and all interacting waters.
+        """Saves .pdb and .dcd files of the trajectory containing ligand, receptor and all interacting waters.
 
         Args:
             interacting_waters (List[int]): List of all interacting water IDs.
@@ -103,7 +103,7 @@ def visualize(
         receptor_type: str = "protein or nucleic",
         height: str = "1000px",
         width: str = "1000px",
-    ) -> nv.NGLWidget:
+    ):
         """Generates visualization of the trajectory with the interacting waters and interaction clouds.
 
         Args:
@@ -112,7 +112,7 @@ def visualize(
             width (str, optional): Width of the visualization. Defaults to '1000px'.
 
         Returns:
-            nv.NGLWidget: Returns an nglview.widget object containing the visualization.
+            nglview widget: Returns an nglview.widget object containing the visualization.
         """
 
         sphere_buffers = []

From cff415ccbc84df00069e8353f8ddd9f1c988fdd5 Mon Sep 17 00:00:00 2001
From: Valerij Talagayev <82884038+talagayev@users.noreply.github.com>
Date: Fri, 25 Oct 2024 20:18:14 +0200
Subject: [PATCH 17/23] Update preprocessing.py

added the comments back in and adjusted small parts
---
 openmmdl/openmmdl_analysis/preprocessing.py | 52 +++++++++++++++++++--
 1 file changed, 47 insertions(+), 5 deletions(-)

diff --git a/openmmdl/openmmdl_analysis/preprocessing.py b/openmmdl/openmmdl_analysis/preprocessing.py
index d24c16b3..db79d43d 100644
--- a/openmmdl/openmmdl_analysis/preprocessing.py
+++ b/openmmdl/openmmdl_analysis/preprocessing.py
@@ -1,13 +1,13 @@
 import MDAnalysis as mda
+import mdtraj as md
 import os
 import re
 from typing import List, Optional
 import rdkit
 from rdkit import Chem
-from rdkit.Chem import AllChem
-from rdkit.Chem import Mol
+from rdkit.Chem import AllChem, Mol
 from openbabel import pybel
-import mdtraj as md
+
 
 
 class Preprocessing:
@@ -25,12 +25,16 @@ def renumber_protein_residues(
             reference_pdb (str): Path to the reference PDB file representing the molecular dynamics trajectory used as a reference.
             output_pdb (str): Path to the output PDB file where the renumbered trajectory will be saved.
         """
+
+        # Load trajectories
         traj_input = md.load(input_pdb)
         traj_reference = md.load(reference_pdb)
 
+        # Get the topology DataFrames
         input_top_df, _ = traj_input.topology.to_dataframe()
         ref_top_df, _ = traj_reference.topology.to_dataframe()
 
+        # List of common protein residue names including additional residues
         protein_residues = [
             "ALA",
             "ARG",
@@ -59,7 +63,9 @@ def renumber_protein_residues(
             "HIP",
         ]
 
+        # Iterate over each chain in the reference topology
         for chain_id in ref_top_df["chainID"].unique():
+            # Extract residue indices from the reference topology for the current chain
             ref_residue_indices = (
                 ref_top_df[
                     (ref_top_df["chainID"] == chain_id)
@@ -68,14 +74,20 @@ def renumber_protein_residues(
                 - 1
             )
 
+            # Update residue indices in the input topology DataFrame for the current chain
             mask = (input_top_df["chainID"] == chain_id) & input_top_df["resName"].isin(
                 protein_residues
             )
             input_top_df.loc[mask, "resSeq"] = ref_residue_indices + 1
 
+        # Create a new topology from the modified DataFrame
         new_top = md.Topology.from_dataframe(input_top_df)
+
+        # Update the topology in the input trajectory
         new_traj = traj_input.slice(range(traj_input.n_frames))
         new_traj.topology = new_top
+
+        # Save the renumbered trajectory to a new PDB file
         new_traj.save(output_pdb)
 
     def increase_ring_indices(self, ring: List[int], lig_index: int) -> List[int]:
@@ -93,14 +105,18 @@ def increase_ring_indices(self, ring: List[int], lig_index: int) -> List[int]:
 
     def convert_ligand_to_smiles(self, input_sdf: str, output_smi: str) -> None:
         """
-        Converts ligand structures from an SDF file to SMILES format.
+        Converts ligand structures from an SDF file to SMILES :) format.
 
         Args:
             input_sdf (str): Path to the SDF file with the ligand that will be converted.
             output_smi (str): Path to the output SMILES file.
         """
+        # Create a molecule supplier from an SDF file
         mol_supplier = Chem.SDMolSupplier(input_sdf)
+
+        # Open the output SMILES file for writing
         with open(output_smi, "w") as output_file:
+            # Iterate through molecules and convert each to SMILES
             for mol in mol_supplier:
                 if mol is not None:
                     smiles = Chem.MolToSmiles(mol)
@@ -110,18 +126,22 @@ def convert_ligand_to_smiles(self, input_sdf: str, output_smi: str) -> None:
 
     def process_pdb_file(self, input_pdb_filename: str) -> None:
         """
-        Process a PDB file to make it compatible with the openmmdl_analysis package.
+        Process a PDB file to make it compatible with OpenMMDL Analysis.
 
         Args:
             input_pdb_filename (str): Path to the input PDB file.
         """
+        # Load the PDB file
         u = mda.Universe(input_pdb_filename)
+        # Iterate through the topology to modify residue names
         for atom in u.atoms:
             resname = atom.resname
+            # Check if the residue name is one of the specified water names
             if resname in ["SPC", "TIP3", "TIP4", "WAT", "T3P", "T4P", "T5P"]:
                 atom.residue.resname = "HOH"
             elif resname == "*":
                 atom.residue.resname = "UNK"
+        # Save the modified topology to a new PDB file
         u.atoms.write(input_pdb_filename)
 
     def extract_and_save_ligand_as_sdf(
@@ -135,7 +155,10 @@ def extract_and_save_ligand_as_sdf(
             output_filename (str): Name of the output SDF file.
             target_resname (str): Residue name of the ligand in the original PDB file.
         """
+        # Load the PDB file using MDAnalysis
         u = mda.Universe(input_pdb_filename)
+
+        # Find the ligand by its residue name
         ligand_atoms = u.select_atoms(f"resname {target_resname}")
 
         if len(ligand_atoms) == 0:
@@ -144,11 +167,14 @@ def extract_and_save_ligand_as_sdf(
             )
             return
 
+        # Create a new Universe with only the ligand
         ligand_universe = mda.Merge(ligand_atoms)
         ligand_universe.atoms.write("lig.pdb")
 
+        # use openbabel to convert pdb to sdf
         mol = next(pybel.readfile("pdb", "lig.pdb"))
         mol.write("sdf", output_filename, overwrite=True)
+        # remove the temporary pdb file
         os.remove("lig.pdb")
 
     def renumber_atoms_in_residues(
@@ -162,6 +188,7 @@ def renumber_atoms_in_residues(
             output_pdb_file (str): Path to the output PDB file.
             lig_name (str): Name of the ligand in the input PDB file.
         """
+        # Read the input PDB file
         with open(input_pdb_file, "r") as f:
             pdb_lines = f.readlines()
 
@@ -170,21 +197,33 @@ def renumber_atoms_in_residues(
 
         for line in pdb_lines:
             if line.startswith("ATOM"):
+                # Extract information from the ATOM line
                 atom_serial = int(line[6:11])
                 atom_name = line[12:16].strip()
                 residue_name = line[17:20].strip()
 
+                # Check if the residue is residue_name
                 if residue_name == lig_name:
+                    
+                    # Extract the element from the atom name (assuming it starts with a capital letter)
                     element_match = re.match(r"([A-Z]+)", atom_name)
                     element = element_match.group(1) if element_match else atom_name
+                    
+                    # Increment the count for the current element in the current residue_name residue
                     lig_residue_elements[element] = (
                         lig_residue_elements.get(element, 0) + 1
                     )
+
+                    # Update the atom name based on the element count
                     new_atom_name = f"{element}{lig_residue_elements[element]}"
+
+                    # Replace the old atom name with the new one
                     line = line[:12] + f"{new_atom_name:4}" + line[16:]
 
+            # Append the line to the new PDB lines
             new_pdb_lines.append(line)
 
+        # Write the modified PDB lines to the output file
         with open(output_pdb_file, "w") as f:
             f.writelines(new_pdb_lines)
 
@@ -201,7 +240,10 @@ def replace_atom_type(self, data: str) -> str:
         lines = data.split("\n")
         for i, line in enumerate(lines):
             if " LIG  X" in line:
+                # Extract the last column which contains the atom type (O/N/H)
                 atom_type = line[12:13].strip()
+
+                # Replace 'X' with the correct atom type
                 lines[i] = line.replace(" LIG  X", f" LIG  {atom_type}")
         return "\n".join(lines)
 

From ac6ca37e681295b6d589b427794cfbfab79e0ddd Mon Sep 17 00:00:00 2001
From: Valerij Talagayev <82884038+talagayev@users.noreply.github.com>
Date: Fri, 25 Oct 2024 20:43:03 +0200
Subject: [PATCH 18/23] Update barcode_generation.py

adjusted small typos
---
 openmmdl/openmmdl_analysis/barcode_generation.py | 13 ++-----------
 1 file changed, 2 insertions(+), 11 deletions(-)

diff --git a/openmmdl/openmmdl_analysis/barcode_generation.py b/openmmdl/openmmdl_analysis/barcode_generation.py
index 8bf6c78b..931b3155 100644
--- a/openmmdl/openmmdl_analysis/barcode_generation.py
+++ b/openmmdl/openmmdl_analysis/barcode_generation.py
@@ -115,22 +115,13 @@ def interacting_water_ids(self, waterbridge_interactions: List[str]) -> List[int
         return list(set(interacting_waters))
 
 
-class BarcodeGenerator:
-    def __init__(self, df: pd.DataFrame):
-        self.df = df
-
-    def generate_waterids_barcode(self, interaction: str) -> List[Union[int, int]]:
-        # Placeholder implementation
-        return []
-
-
 class BarcodePlotter:
     def __init__(self, df_all: pd.DataFrame):
         """
         Initializes the BarcodePlotter with a dataframe and BarcodeGenerator instance.
 
         Args:
-            df_all (pd.DataFrame): Dataframe containing all interactions from plip analysis.
+            df_all (pd.DataFrame): Dataframe containing all interactions from PLIP analysis.
         """
         self.df_all = df_all
         self.barcode_gen = BarcodeGenerator(df_all)
@@ -195,7 +186,7 @@ def plot_waterbridge_piechart(
         Args:
             waterbridge_barcodes (Dict[str, np.ndarray]): Dictionary where keys are interaction names and values are binary barcode arrays.
             waterbridge_interactions (List[str]): List of water bridge interaction names.
-            fig_type (str): File extension for the saved figures (e.g., 'png', 'pdf').
+            fig_type (str): File extension for the saved figures (e.g., 'png', 'svg').
         """
         if not waterbridge_barcodes:
             print("No Piecharts to plot.")

From 4b5fcd4648a0c666d383e3173b4ebe93a17be449 Mon Sep 17 00:00:00 2001
From: Valerij Talagayev <82884038+talagayev@users.noreply.github.com>
Date: Tue, 29 Oct 2024 14:21:35 +0100
Subject: [PATCH 19/23] Update find_stable_waters.py

adjusted and added the comments back in
---
 .../openmmdl_analysis/find_stable_waters.py   | 39 +++++++++++++++++--
 1 file changed, 35 insertions(+), 4 deletions(-)

diff --git a/openmmdl/openmmdl_analysis/find_stable_waters.py b/openmmdl/openmmdl_analysis/find_stable_waters.py
index 390d4fc1..fb75208b 100644
--- a/openmmdl/openmmdl_analysis/find_stable_waters.py
+++ b/openmmdl/openmmdl_analysis/find_stable_waters.py
@@ -24,13 +24,18 @@ def trace_waters(self, output_directory: str) -> Tuple[pd.DataFrame, int]:
         Returns:
             Tuple[pd.DataFrame, int]: DataFrame containing stable water coordinates and total number of frames.
         """
+        # Get the total number of frames for the progress bar
         total_frames = len(self.u.trajectory)
+        
+        # Create an empty DataFrame to store stable water coordinates
         stable_waters = pd.DataFrame(
             columns=["Frame", "Residue", "Oxygen_X", "Oxygen_Y", "Oxygen_Z"]
         )
 
+        # Initialize variables for the previous frame's data
         prev_frame_coords = {}
 
+        # Iterate through frames with tqdm for the progress bar
         for ts in tqdm(
             self.u.trajectory,
             total=total_frames,
@@ -39,6 +44,7 @@ def trace_waters(self, output_directory: str) -> Tuple[pd.DataFrame, int]:
             frame_num = ts.frame
             frame_coords = {}
 
+            # Iterate through oxygen atoms of the specified water type
             for atom in self.u.select_atoms("resname HOH and name O"):
                 frame_coords[atom.index] = (
                     atom.position[0],
@@ -46,19 +52,26 @@ def trace_waters(self, output_directory: str) -> Tuple[pd.DataFrame, int]:
                     atom.position[2],
                 )
 
+            # Check if it's not the first frame
             if frame_num > 0:
                 stable_coords = []
 
+                # Iterate through the oxygen atoms in the current frame
                 for atom_index, coords in frame_coords.items():
-                    prev_coords = prev_frame_coords.get(atom_index, coords)
+                    prev_coords = prev_frame_coords.get(
+                        atom_index, coords
+                    )  # Get previous coordinates or use current if not found
 
+                    # Calculate the distance between current and previous coordinates
                     distance = np.linalg.norm(np.array(coords) - np.array(prev_coords))
 
+                    # If the distance is less than 1 Angstrom, consider it a stable water
                     if distance < 1:
                         stable_coords.append(
                             (frame_num, atom_index, coords[0], coords[1], coords[2])
                         )
 
+                # Append stable water coordinates to the stable_waters DataFrame
                 if stable_coords:
                     stable_waters = pd.concat(
                         [
@@ -76,6 +89,7 @@ def trace_waters(self, output_directory: str) -> Tuple[pd.DataFrame, int]:
                         ]
                     )
 
+            # Update the previous frame's coordinates
             prev_frame_coords = frame_coords
 
         stable_waters.to_csv(
@@ -95,12 +109,16 @@ def perform_clustering_and_writing(
 
         Args:
             stable_waters (pd.DataFrame): DataFrame containing stable water coordinates.
-            cluster_eps (float): DBSCAN clustering epsilon parameter.
+            cluster_eps (float): DBSCAN clustering epsilon parameter. This is in Angstrom in this case
+                                 and defines which Water distances should be within one cluster.
             total_frames (int): Total number of frames.
             output_directory (str): Directory where output files will be saved.
         """
+        
+        # Feature extraction: XYZ coordinates
         X = stable_waters[["Oxygen_X", "Oxygen_Y", "Oxygen_Z"]]
 
+        # List of percentages to iterate over
         percentage_values = [25, 50, 75, 90, 99]
 
         for percent in percentage_values:
@@ -152,6 +170,7 @@ def write_pdb_clusters_and_representatives(
                     pdb_lines.append(pdb_line)
                     atom_counter += 1
 
+                # Write the current cluster to a new PDB file
                 output_filename = os.path.join(
                     output_sub_directory, f"cluster_{label}.pdb"
                 )
@@ -161,6 +180,7 @@ def write_pdb_clusters_and_representatives(
 
                 pdb_file_counter += 1
 
+            # Write representative water molecules to a PDB file
             representative_waters = clustered_waters.groupby("Cluster_Label").mean()
             representative_waters.reset_index(inplace=True)
             representative_filename = os.path.join(
@@ -177,11 +197,15 @@ def stable_waters_pipeline(self, output_directory: str = "./stableWaters") -> No
         Args:
             output_directory (str, optional): Directory where output files will be saved. Default is "./stableWaters".
         """
+        # Load the PDB and DCD files
         output_directory += "_clusterEps_"
         strEps = str(self.water_eps).replace(".", "")
         output_directory += strEps
         os.makedirs(output_directory, exist_ok=True)
+        
+        # Create a stable waters list by calling the process_trajectory_and_cluster function
         stable_waters, total_frames = self.trace_waters(output_directory)
+        # Now call perform_clustering_and_writing with the returned values
         self.perform_clustering_and_writing(
             stable_waters, self.water_eps, total_frames, output_directory
         )
@@ -205,9 +229,11 @@ def filter_and_parse_pdb(self, protein_pdb: str) -> Structure:
                 )
             ]
 
+        # Convert the list of lines to a string buffer
         pdb_string = "".join(lines)
         pdb_buffer = StringIO(pdb_string)
 
+        # Now parse the filtered lines
         parser = PDBParser(QUIET=True)
         structure = parser.get_structure("protein", pdb_buffer)
 
@@ -219,7 +245,8 @@ def find_interacting_residues(
         representative_waters: pd.DataFrame,
         distance_threshold: float,
     ) -> Dict[int, List[Tuple[str, int]]]:
-        """Maps waters (e.g. the representative waters) to interacting residues of a different PDB structure input.
+        """Maps waters (e.g. the representative waters) to interacting residues of a different PDB structure input. 
+           Use "filter_and_parse_pdb" to get the input for this function.
         Args:
             structure (Structure): Biopython PDB structure object.
             representative_waters (pd.DataFrame): DataFrame containing representative water coordinates.
@@ -232,6 +259,7 @@ def find_interacting_residues(
 
         for model in structure:
             for chain in model:
+                # Check if the residue is a protein residue (not a heteroatom or water molecule)
                 for residue in chain:
                     if (
                         residue.id[0] == " "
@@ -252,7 +280,7 @@ def find_interacting_residues(
 
                             distance = np.linalg.norm(wat_coords - residue_coords)
                             if distance < distance_threshold:
-                                key = wat_index
+                                key = wat_index # Assuming wat_index is the number of the water cluster
                                 if key not in interacting_residues:
                                     interacting_residues[key] = []
                                 interacting_residues[key].append(
@@ -306,9 +334,11 @@ def analyze_protein_and_water_interaction(
         strEps = str(cluster_eps).replace(".", "")
         output_directory += strEps
 
+        # Iterate over subdirectories
         for subdirectory in os.listdir(output_directory):
             subdirectory_path = os.path.join(output_directory, subdirectory)
             if os.path.isdir(subdirectory_path):
+                # Perform operations within each subdirectory
                 representative_waters = self.read_pdb_as_dataframe(
                     os.path.join(subdirectory_path, representative_waters_file)
                 )
@@ -320,6 +350,7 @@ def analyze_protein_and_water_interaction(
                     interacting_residues.items(),
                     columns=["Cluster_Number", "Interacting_Residues"],
                 )
+                # Save result to each subdirectory
                 result_df.to_csv(
                     os.path.join(subdirectory_path, "interacting_residues.csv"),
                     index=False,

From f5aa320f85bddce6b3d2e3a069e12af3d4b052f2 Mon Sep 17 00:00:00 2001
From: Valerij Talagayev <82884038+talagayev@users.noreply.github.com>
Date: Wed, 6 Nov 2024 15:46:39 +0100
Subject: [PATCH 20/23] Update interaction_gathering.py

many modifications and removal of code lines, set it back to initial version
---
 .../interaction_gathering.py                  | 415 +++++++++++-------
 1 file changed, 260 insertions(+), 155 deletions(-)

diff --git a/openmmdl/openmmdl_analysis/interaction_gathering.py b/openmmdl/openmmdl_analysis/interaction_gathering.py
index 08d0d8f3..aa56fbb0 100644
--- a/openmmdl/openmmdl_analysis/interaction_gathering.py
+++ b/openmmdl/openmmdl_analysis/interaction_gathering.py
@@ -3,44 +3,37 @@
 import MDAnalysis as mda
 from tqdm import tqdm
 from plip.basic import config
-from plip.structure.preparation import PDBComplex, PLInteraction
+from plip.structure.preparation import PDBComplex, LigandFinder, Mol, PLInteraction
 from plip.exchange.report import BindingSiteReport
 from multiprocessing import Pool
 from functools import partial
-from typing import List, Dict, Tuple, Optional, Union
 
 config.KEEPMOD = True
 
 
 class InteractionAnalyzer:
     def __init__(
-        self,
-        pdb_md: mda.Universe,
-        dataframe: Optional[str],
-        num_processes: int,
-        lig_name: str,
-        special_ligand: Optional[str],
-        peptide: Optional[str],
-    ) -> None:
+        self, pdb_md, dataframe, num_processes, lig_name, special_ligand, peptide
+    ):
         self.pdb_md = pdb_md
         self.dataframe = dataframe
         self.num_processes = num_processes
         self.lig_name = lig_name
         self.special = special_ligand
         self.peptide = peptide
-        self.interaction_list = self.process_trajectory()
+        self.ineraction_list = self.process_trajectory()
 
     def characterize_complex(
         self, pdb_file: str, binding_site_id: str
-    ) -> Optional[PLInteraction]:
-        """Characterize the protein-ligand complex and return their interaction set.
+    ) -> PLInteraction:
+        """Characterize the protein-ligand complex and return their interaction set
 
         Args:
-            pdb_file (str): Path to the PDB file.
-            binding_site_id (str): Identifier of the binding site.
+            pdb_file (str): A string, which represents the path to the PDB File
+            binding_site_id (str): A string that specifies the identifier of the binding site
 
         Returns:
-            Optional[PLInteraction]: Interactions if found; otherwise, None.
+            PLInteraction: A object representing the interactions if. If Binding site is not found returns None
         """
         pdb_complex = PDBComplex()
         pdb_complex.load_pdb(pdb_file)
@@ -50,30 +43,31 @@ def characterize_complex(
                 == binding_site_id
             ):
                 pdb_complex.characterize_complex(ligand)
-                return pdb_complex.interaction_sets[binding_site_id]
 
-        return None
+        return pdb_complex.interaction_sets[binding_site_id]
 
-    def retrieve_plip_interactions(
-        self, pdb_file: str, lig_name: str
-    ) -> Dict[str, Dict[str, List]]:
-        """Retrieve interactions from PLIP.
+    def retrieve_plip_interactions(self, pdb_file, lig_name):
+        """Retrieves the interactions from PLIP.
 
         Args:
-            pdb_file (str): Path to the PDB file of the complex.
-            lig_name (str): Name of the ligand in the complex topology.
+            pdb_file (str): The path of the PDB file of the complex.
+            lig_name (str): Name of the Ligand in the complex topology that will be analyzed.
 
         Returns:
-            Dict[str, Dict[str, List]]: Dictionary of binding sites and interactions.
+            dict: A dictionary of the binding sites and the interactions.
         """
         protlig = PDBComplex()
-        protlig.load_pdb(pdb_file)
+        protlig.load_pdb(pdb_file)  # load the pdb file
         for ligand in protlig.ligands:
             if str(ligand.longname) == lig_name:
-                protlig.characterize_complex(ligand)
-        sites: Dict[str, Dict[str, List]] = {}
+                protlig.characterize_complex(
+                    ligand
+                )  # find ligands and analyze interactions
+        sites = {}
+        # loop over binding sites
         for key, site in sorted(protlig.interaction_sets.items()):
-            binding_site = BindingSiteReport(site)
+            binding_site = BindingSiteReport(site)  # collect data about interactions
+            # tuples of *_features and *_info will be converted to pandas DataFrame
             keys = (
                 "hydrophobic",
                 "hbond",
@@ -84,6 +78,9 @@ def retrieve_plip_interactions(
                 "halogen",
                 "metal",
             )
+            # interactions is a dictionary which contains relevant information for each
+            # of the possible interactions: hydrophobic, hbond, etc. in the considered
+            # binding site.
             interactions = {
                 k: [getattr(binding_site, k + "_features")]
                 + getattr(binding_site, k + "_info")
@@ -93,23 +90,26 @@ def retrieve_plip_interactions(
 
         return sites
 
-    def retrieve_plip_interactions_peptide(
-        self, pdb_file: str
-    ) -> Dict[str, Dict[str, List]]:
-        """Retrieve interactions from PLIP for a peptide.
+    def retrieve_plip_interactions_peptide(self, pdb_file):
+        """Retrives the interactions from PLIP for a peptide.
 
         Args:
-            pdb_file (str): Path to the PDB file of the complex.
+            pdb_file (str): The path of the PDB file of the complex.
+            peptide (str): Chainid of the peptide that will be analyzed.
 
         Returns:
-            Dict[str, Dict[str, List]]: Dictionary of binding sites and interactions.
+            dict: A dictionary of the binding sites and the interactions.
         """
         protlig = PDBComplex()
-        protlig.load_pdb(pdb_file)
-        protlig.characterize_complex(protlig.ligands[-1])
-        sites: Dict[str, Dict[str, List]] = {}
+        protlig.load_pdb(pdb_file)  # load the pdb file
+        protlig.characterize_complex(
+            protlig.ligands[-1]
+        )  # find ligands and analyze interactions
+        sites = {}
+        # loop over binding sites
         for key, site in sorted(protlig.interaction_sets.items()):
-            binding_site = BindingSiteReport(site)
+            binding_site = BindingSiteReport(site)  # collect data about interactions
+            # tuples of *_features and *_info will be converted to pandas DataFrame
             keys = (
                 "hydrophobic",
                 "hbond",
@@ -120,6 +120,9 @@ def retrieve_plip_interactions_peptide(
                 "halogen",
                 "metal",
             )
+            # interactions is a dictionary which contains relevant information for each
+            # of the possible interactions: hydrophobic, hbond, etc. in the considered
+            # binding site.
             interactions = {
                 k: [getattr(binding_site, k + "_features")]
                 + getattr(binding_site, k + "_info")
@@ -130,19 +133,18 @@ def retrieve_plip_interactions_peptide(
         return sites
 
     def create_df_from_binding_site(
-        self,
-        selected_site_interactions: Dict[str, List],
-        interaction_type: str = "hbond",
-    ) -> pd.DataFrame:
-        """Create a DataFrame from a binding site and interaction type.
+        self, selected_site_interactions, interaction_type="hbond"
+    ):
+        """Creates a data frame from a binding site and interaction type.
 
         Args:
-            selected_site_interactions (Dict[str, List]): Precalculated interactions from PLIP for the selected site.
-            interaction_type (str, optional): Interaction type of interest (default is "hbond").
+            selected_site_interactions (dict): Precaluclated interactions from PLIP for the selected site
+            interaction_type (str, optional): The interaction type of interest (default set to hydrogen bond). Defaults to "hbond".
 
         Returns:
-            pd.DataFrame: DataFrame with information from PLIP.
+            pandas dataframe: DataFrame with information retrieved from PLIP.
         """
+        # check if interaction type is valid:
         valid_types = [
             "hydrophobic",
             "hbond",
@@ -161,17 +163,20 @@ def create_df_from_binding_site(
             interaction_type = "hbond"
 
         df = pd.DataFrame.from_records(
+            # data is stored AFTER the column names
             selected_site_interactions[interaction_type][1:],
+            # column names are always the first element
             columns=selected_site_interactions[interaction_type][0],
         )
         return df
 
-    def change_lig_to_residue(self, file_path: str, new_residue_name: str) -> None:
-        """Reformat the topology file to change the ligand to a residue.
+    def change_lig_to_residue(self, file_path, new_residue_name):
+        """Reformats the topology file to change the ligand to a residue. This is needed for interactions with special ligands such as metal ions.
 
         Args:
-            file_path (str): Path to the topology file.
-            new_residue_name (str): New residue name.
+            file_path (str): Filepath of the topology file.
+            old_residue_name (str): Residue name of the ligand.
+            new_residue_name (str): New residue name of the ligand now changed to mimic an amino acid residue.
         """
         with open(file_path, "r") as file:
             lines = file.readlines()
@@ -179,10 +184,14 @@ def change_lig_to_residue(self, file_path: str, new_residue_name: str) -> None:
         with open(file_path, "w") as file:
             for line in lines:
                 if line.startswith("HETATM") or line.startswith("ATOM"):
+                    # Assuming the standard PDB format for simplicity
+                    # You may need to adapt this part based on your specific PDB file
                     atom_name = line[12:16].strip()
                     residue_name = line[17:20].strip()
 
+                    # Check if the residue name matches the one to be changed
                     if residue_name == self.lig_name:
+                        # Change the residue name to the new one
                         modified_line = line[:17] + new_residue_name + line[20:]
                         file.write(modified_line)
                     else:
@@ -190,65 +199,24 @@ def change_lig_to_residue(self, file_path: str, new_residue_name: str) -> None:
                 else:
                     file.write(line)
 
-    def process_frame_special(self, frame: int) -> pd.DataFrame:
-        """Process a single frame of MD simulation for special ligands.
-
-        Args:
-            frame (int): Frame number to be processed.
-
-        Returns:
-            pd.DataFrame: DataFrame containing interaction data for the processed frame.
-        """
-        atoms_selected = self.pdb_md.select_atoms(
-            f"protein or nucleic or resname {self.lig_name} or (resname HOH and around 10 resname {self.lig_name}) or resname {self.special}"
-        )
-        for num in self.pdb_md.trajectory[(frame) : (frame + 1)]:
-            atoms_selected.write(f"processing_frame_{frame}.pdb")
-        interaction_list = pd.DataFrame()
-        interactions_by_site = self.retrieve_plip_interactions(
-            f"processing_frame_{frame}.pdb", self.lig_name
-        )
-        index_of_selected_site = -1
-        selected_site = list(interactions_by_site.keys())[index_of_selected_site]
-
-        interaction_types = [
-            "hydrophobic",
-            "hbond",
-            "waterbridge",
-            "saltbridge",
-            "pistacking",
-            "pication",
-            "halogen",
-            "metal",
-        ]
-
-        for interaction_type in interaction_types:
-            tmp_interaction = self.create_df_from_binding_site(
-                interactions_by_site[selected_site],
-                interaction_type=interaction_type,
-            )
-            tmp_interaction["frame"] = frame
-            tmp_interaction["interaction_type"] = interaction_type
-            interaction_list = pd.concat([interaction_list, tmp_interaction])
-
-        os.remove(f"processing_frame_{frame}.pdb")
-        return interaction_list
-
-    def process_frame(self, frame: int) -> pd.DataFrame:
+    def process_frame(self, frame):
         """Process a single frame of MD simulation.
 
         Args:
-            frame (int): Frame number to be processed.
+            frame (int): The number of the frame that will be processed.
+            pdb_md (mda universe): The MDAnalysis Universe class representation of the topology and the trajectory of the file that is being processed.
+            lig_name (str): Name of the ligand in the complex that will be analyzed.
+            special (str, optional): Name of the special ligand in the complex that will be analyzed. Defaults to None.
+            peptide (srt, optional): Chainid of the peptide that will be analyzed. Defaults to None.
 
         Returns:
-            pd.DataFrame: DataFrame containing interaction data for the processed frame.
+            pandas dataframe: A dataframe conatining the interaction data for the processed frame.
         """
         atoms_selected = self.pdb_md.select_atoms(
             f"protein or nucleic or resname {self.lig_name} or (resname HOH and around 10 resname {self.lig_name}) or resname {self.special}"
         )
         for num in self.pdb_md.trajectory[(frame) : (frame + 1)]:
             atoms_selected.write(f"processing_frame_{frame}.pdb")
-        interaction_list = pd.DataFrame()
         if self.peptide is None:
             interactions_by_site = self.retrieve_plip_interactions(
                 f"processing_frame_{frame}.pdb", self.lig_name
@@ -267,19 +235,68 @@ def process_frame(self, frame: int) -> pd.DataFrame:
                 "metal",
             ]
 
+            interaction_list = pd.DataFrame()
             for interaction_type in interaction_types:
                 tmp_interaction = self.create_df_from_binding_site(
                     interactions_by_site[selected_site],
                     interaction_type=interaction_type,
                 )
-                tmp_interaction["frame"] = frame
-                tmp_interaction["interaction_type"] = interaction_type
+                tmp_interaction["FRAME"] = int(frame)
+                tmp_interaction["INTERACTION"] = interaction_type
                 interaction_list = pd.concat([interaction_list, tmp_interaction])
-        else:
+            if os.path.exists(f"processing_frame_{frame}.pdb"):
+                os.remove(f"processing_frame_{frame}.pdb")
+
+            if self.special is not None:
+                combi_lig_special = mda.Universe("ligand_special.pdb")
+                complex = mda.Universe("complex.pdb")
+                complex_all = complex.select_atoms("all")
+                result = self.process_frame_special(frame)
+                results_df = pd.concat(result, ignore_index=True)
+                results_df = results_df[results_df["LOCATION"] == "protein.sidechain"]
+                results_df["RESTYPE"] = results_df["RESTYPE"].replace(
+                    ["HIS", "SER", "CYS"], self.lig_name
+                )
+                results_df["LOCATION"] = results_df["LOCATION"].replace(
+                    "protein.sidechain", "ligand"
+                )
+                updated_target_idx = []
+
+                for index, row in results_df.iterrows():
+                    ligand_special_int_nr = int(row["TARGET_IDX"])
+                    ligand_special_int_nr_atom = combi_lig_special.select_atoms(
+                        f"id {ligand_special_int_nr}"
+                    )
+                    for atom in ligand_special_int_nr_atom:
+                        atom_name = atom.name
+                        # Adjust atom_name based on the specified conditions
+                        if atom_name in ["N", "C", "O", "S"]:
+                            atom_name = f"{atom_name}1"
+                        else:
+                            # Assuming the format is a single letter followed by a number
+                            base_name, atom_number = atom_name[:-1], int(atom_name[-1])
+                            new_atom_number = atom_number + 1
+                            atom_name = f"{base_name}{new_atom_number}"
+                        for complex_atom in complex_all:
+                            complex_atom_name = complex_atom.name
+                            if atom_name == complex_atom_name:
+                                true_number = complex_atom.id
+                                break  # Exit the loop once a match is found
+                        updated_target_idx.append(true_number)
+
+                # Update 'TARGET_IDX' in interaction_list
+                results_df["TARGET_IDX"] = updated_target_idx
+                interaction_list["TARGET_IDX"] = interaction_list["TARGET_IDX"]
+
+                # Concatenate the updated results_df to interaction_list
+                interaction_list = pd.concat([interaction_list, results_df])
+        if self.peptide is not None:
             interactions_by_site = self.retrieve_plip_interactions_peptide(
                 f"processing_frame_{frame}.pdb"
             )
-            selected_site = list(interactions_by_site.keys())[0]
+            index_of_selected_site = -1
+            selected_site = list(interactions_by_site.keys())[index_of_selected_site]
+
             interaction_types = [
                 "hydrophobic",
                 "hbond",
@@ -290,85 +307,173 @@ def process_frame(self, frame: int) -> pd.DataFrame:
                 "halogen",
                 "metal",
             ]
+
+            interaction_list = pd.DataFrame()
             for interaction_type in interaction_types:
                 tmp_interaction = self.create_df_from_binding_site(
                     interactions_by_site[selected_site],
                     interaction_type=interaction_type,
                 )
-                tmp_interaction["frame"] = frame
-                tmp_interaction["interaction_type"] = interaction_type
+                tmp_interaction["FRAME"] = int(frame)
+                tmp_interaction["INTERACTION"] = interaction_type
                 interaction_list = pd.concat([interaction_list, tmp_interaction])
+            if os.path.exists(f"processing_frame_{frame}.pdb"):
+                os.remove(f"processing_frame_{frame}.pdb")
 
-        os.remove(f"processing_frame_{frame}.pdb")
         return interaction_list
 
-    def process_frame_wrapper(self, frame: int) -> pd.DataFrame:
-        """Wrapper function to handle processing of a frame, including special cases.
+    def process_frame_special(self, frame):
+        """Function extension of process_frame to process special ligands.
 
         Args:
-            frame (int): Frame number to be processed.
+            frame (int): Number of the frame that will be processed.
+            pdb_md (mda universe): MDA Universe class representation of the topology and the trajectory of the file that is being processed.
+            lig_name (str): Name of the ligand in the complex that will be analyzed.
+            special (str, optional): Name of the special ligand that will be analysed. Defaults to None.
 
         Returns:
-            pd.DataFrame: DataFrame containing interaction data for the processed frame.
+            list: list of dataframes containing the interaction data for the processed frame with the special ligand.
         """
-        if self.special:
-            return self.process_frame_special(frame)
-        return self.process_frame(frame)
+        res_renaming = ["HIS", "SER", "CYS"]
+        interaction_dfs = []
+        for res in res_renaming:
+            self.pdb_md.trajectory[frame]
+            atoms_selected = self.pdb_md.select_atoms(
+                f"resname {self.lig_name} or resname {self.special}"
+            )
+            atoms_selected.write(f"processing_frame_{frame}.pdb")
+            self.change_lig_to_residue(f"processing_frame_{frame}.pdb", res)
+            interactions_by_site = self.retrieve_plip_interactions(
+                f"processing_frame_{frame}.pdb", self.special
+            )
+            index_of_selected_site = -1
+            selected_site = list(interactions_by_site.keys())[index_of_selected_site]
+            interaction_types = ["metal"]
+            interaction_list = pd.DataFrame()
+            for interaction_type in interaction_types:
+                tmp_interaction = self.create_df_from_binding_site(
+                    interactions_by_site[selected_site],
+                    interaction_type=interaction_type,
+                )
+                tmp_interaction["FRAME"] = int(frame)
+                tmp_interaction["INTERACTION"] = interaction_type
+                interaction_list = pd.concat([interaction_list, tmp_interaction])
+            interaction_dfs.append(interaction_list)
+            os.remove(f"processing_frame_{frame}.pdb")
+        return interaction_dfs
 
-    def fill_missing_frames(self, df: pd.DataFrame) -> pd.DataFrame:
-        """Fill in missing frames in the DataFrame with NaNs.
+    def process_frame_wrapper(self, args):
+        """Wrapper for the MD Trajectory procession.
 
         Args:
-            df (pd.DataFrame): Original DataFrame with potentially missing frames.
+            args (tuple): Tuple containing (frame_idx: int - number of the frame to be processed, pdb_md: mda universe - MDA Universe class representation of the topology and the trajectory of the file that is being processed, lig_name: str - Name of the ligand in the complex that will be analyzed, special_ligand: str - Name of the special ligand that will be analysed, peptide: str - Chainid of the peptide that will be analyzed)
 
         Returns:
-            pd.DataFrame: DataFrame with missing frames filled.
+            tuple: tuple containing the frame index and the result of from the process_frame function.
         """
-        all_frames = set(range(len(self.pdb_md.trajectory)))
-        existing_frames = set(df["frame"].unique())
-        missing_frames = all_frames - existing_frames
-        missing_data = []
-
-        for frame in missing_frames:
-            for interaction_type in df["interaction_type"].unique():
-                missing_data.append(
-                    {
-                        "frame": frame,
-                        "interaction_type": interaction_type,
-                        # Add other necessary columns with NaNs or empty values as appropriate
-                    }
-                )
+        frame_idx, pdb_md, lig_name, special_ligand, peptide = args
+
+        return frame_idx, self.process_frame(frame_idx)
 
-        missing_df = pd.DataFrame(missing_data)
-        filled_df = pd.concat([df, missing_df], ignore_index=True)
-        return filled_df
+    def fill_missing_frames(self, df):
+        """Fills the frames with no interactions in the DataFrame with placeholder values.
 
-    def process_trajectory(self) -> pd.DataFrame:
-        """Main method for processing the trajectory.
+        Args:
+            df (pandas dataframe): The input DataFrame with frames that have no Interactions
+            md_len (int): The value that indicates the number of frames, thus allowing the function to loop through the DataFrame
 
         Returns:
-            pd.DataFrame: A DataFrame containing all interaction data for the trajectory.
+            pandas dataframe: DataFrame with placeholder values in the frames with no interactions.
         """
-        pool = Pool(processes=self.num_processes)
-        with tqdm(total=len(self.pdb_md.trajectory)) as pbar:
-            interaction_list = pd.concat(
-                list(
-                    tqdm(
-                        pool.imap(
-                            self.process_frame_wrapper,
-                            range(len(self.pdb_md.trajectory)),
-                        ),
-                        total=len(self.pdb_md.trajectory),
-                    )
+
+        md_len = len(self.pdb_md.trajectory) - 1
+        # Create a set containing all unique values in the 'FRAME' column
+        existing_frames = set(df["FRAME"])
+
+        # Create a list to store new rows for missing numbers
+        missing_rows = []
+
+        # Iterate through numbers from 0 to md_len
+        for frame_number in range(1, md_len):
+            if frame_number not in existing_frames:
+                # Create a new row with 'FRAME' set to the missing number and other columns set to "skip"
+                missing_row = {"FRAME": frame_number}
+                for col in df.columns:
+                    if col != "FRAME":
+                        missing_row[col] = "skip"
+                missing_rows.append(missing_row)
+
+        # Concatenate the missing rows with the original DataFrame
+        df = pd.concat([df, pd.DataFrame(missing_rows)], ignore_index=True)
+
+        # Sort the DataFrame by the 'FRAME' column
+        df.sort_values(by="FRAME", inplace=True)
+
+        return df
+
+    def process_trajectory(self):
+        """Process protein-ligand trajectory with multiple CPUs in parallel.
+
+        Args:
+            pdb_md (mda universe): MDAnalysis Universe class representation of the topology and the trajectory of the file that is being processed.
+            dataframe (str): Name of a CSV file as str, where the interaction data will be read from if not None.
+            num_processes (int): The number of CPUs that will be used for the processing of the protein-ligand trajectory. Defaults to half of the CPUs in the system.
+            lig_name (str): Name of the Ligand in the complex that will be analyzed.
+            special_ligand (str): Name of the special ligand in the complex that will be analyzed.
+            peptide (str): Chainid of the peptide that will be analyzed.
+
+        Returns:
+            pandas dataframe: A DataFrame containing all the protein-ligand interaction data from the whole trajectory.
+        """
+        if self.dataframe is None:
+            print("\033[1mProcessing protein-ligand trajectory\033[0m")
+            print(f"\033[1mUsing {self.num_processes} CPUs\033[0m")
+            total_frames = len(self.pdb_md.trajectory) - 1
+
+            with Pool(processes=self.num_processes) as pool:
+                frame_args = [
+                    (i, self.pdb_md, self.lig_name, self.special, self.peptide)
+                    for i in range(1, total_frames + 1)
+                ]
+
+                # Initialize the progress bar with the total number of frames
+                pbar = tqdm(
+                    total=total_frames,
+                    ascii=True,
+                    desc="\033[1mAnalyzing frames\033[0m",
                 )
-            )
-        pool.close()
-        pool.join()
 
-        if self.dataframe:
-            interaction_list.to_csv(self.dataframe)
+                results = []
+                for result in pool.imap(self.process_frame_wrapper, frame_args):
+                    results.append(result)
+                    pbar.update(1)  # Update the progress manually
+
+            # Close the progress bar
+            pbar.close()
+
+            # Extract the results and sort them by frame index
+            results.sort(key=lambda x: x[0])
+            interaction_lists = [result[1] for result in results]
+
+            interaction_list = pd.concat(interaction_lists)
+
+            interaction_list.to_csv("interactions_gathered.csv")
+
+        elif self.dataframe is not None:
+            print(f"\033[1mGathering data from {self.dataframe}\033[0m")
+            interaction_tmp = pd.read_csv(self.dataframe)
+            interaction_list = interaction_tmp.drop(interaction_tmp.columns[0], axis=1)
+
+        interaction_list["Prot_partner"] = (
+            interaction_list["RESNR"].astype(str)
+            + interaction_list["RESTYPE"]
+            + interaction_list["RESCHAIN"]
+        )
+
+        interaction_list = self.fill_missing_frames(
+            interaction_list,
+        )
 
-        # Ensure all frames are represented in the final DataFrame
-        interaction_list = self.fill_missing_frames(interaction_list)
+        print("\033[1mProtein-ligand trajectory processed\033[0m")
 
         return interaction_list

From 5c9603ce47ac801cddb9c10eef5a816c41697287 Mon Sep 17 00:00:00 2001
From: Valerij Talagayev <82884038+talagayev@users.noreply.github.com>
Date: Wed, 6 Nov 2024 16:54:26 +0100
Subject: [PATCH 21/23] Update pharmacophore.py

adjusted
---
 openmmdl/openmmdl_analysis/pharmacophore.py | 69 ++++++++++-----------
 1 file changed, 34 insertions(+), 35 deletions(-)

diff --git a/openmmdl/openmmdl_analysis/pharmacophore.py b/openmmdl/openmmdl_analysis/pharmacophore.py
index 76d00263..d0f684ef 100644
--- a/openmmdl/openmmdl_analysis/pharmacophore.py
+++ b/openmmdl/openmmdl_analysis/pharmacophore.py
@@ -216,7 +216,7 @@ def generate_md_pharmacophore_cloudcenters(
             feature_type = feature_types[interaction]
             if interaction in ["Acceptor_hbond", "Donor_hbond"]:
                 pharm = self.generate_pharmacophore_vectors(
-                    self.df_all.filter(regex=interaction).columns.tolist()
+                    self.df_all.filter(regex=interaction).columns
                 )
                 for feature_name, position in pharm.items():
                     feature_id_counter += 1
@@ -273,7 +273,7 @@ def generate_md_pharmacophore_cloudcenters(
                         )
             elif interaction in ["hydrophobic", "PI_saltbridge", "NI_saltbridge"]:
                 pharm = self.generate_pharmacophore_centers(
-                    self.df_all.filter(regex=interaction).columns.tolist()
+                    self.df_all.filter(regex=interaction).columns
                 )
                 for feature_name, position in pharm.items():
                     feature_id_counter += 1
@@ -298,7 +298,7 @@ def generate_md_pharmacophore_cloudcenters(
                     )
             elif interaction == "pistacking":
                 pharm = self.generate_pharmacophore_vectors(
-                    self.df_all.filter(regex=interaction).columns.tolist()
+                    self.df_all.filter(regex=interaction).columns
                 )
                 feature_id_counter += 1
                 lig_loc = position[0]
@@ -375,22 +375,21 @@ def generate_bindingmode_pharmacophore(
             pharmacophoreType="LIGAND_SCOUT",
         )
 
-        for interaction, coords in dict_bindingmode.items():
-            # Get feature type
-            feature_type = next(
-                (ft for it, ft in feature_types.items() if it in interaction), None
-            )
-            if feature_type is None:
-                continue
+        for interaction in dict_bindingmode.keys():
+            # get feature type
+            for interactiontype in feature_types.keys():
+                if interactiontype in interaction:
+                    feature_type = feature_types[interactiontype]
+                    break
 
             # Generate vector features
             if feature_type in ["HBA", "HBD"]:
                 if feature_type == "HBA":
-                    orig_loc = coords["PROTCOO"][0]
-                    targ_loc = coords["LIGCOO"][0]
+                    orig_loc = dict_bindingmode[interaction]["PROTCOO"][0]
+                    targ_loc = dict_bindingmode[interaction]["LIGCOO"][0]
                 elif feature_type == "HBD":
-                    orig_loc = coords["LIGCOO"][0]
-                    targ_loc = coords["PROTCOO"][0]
+                    orig_loc = dict_bindingmode[interaction]["LIGCOO"][0]
+                    targ_loc = dict_bindingmode[interaction]["PROTCOO"][0]
                 feature_id_counter += 1
                 points_to_lig = "true" if feature_type == "HBA" else "false"
                 hasSyntheticProjectedPoint = "false"
@@ -404,7 +403,7 @@ def generate_bindingmode_pharmacophore(
                     optional="false",
                     disabled="false",
                     weight="1.0",
-                    coreCompound=self.ligand_name,
+                    coreCompound=self.complex_name,
                     id=f"feature{str(feature_id_counter)}",
                 )
                 origin = ET.SubElement(
@@ -425,7 +424,7 @@ def generate_bindingmode_pharmacophore(
                 )
             # Generate point features
             elif feature_type in ["H", "PI", "NI"]:
-                position = coords["LIGCOO"][0]
+                position = dict_bindingmode[interaction]["LIGCOO"][0]
                 feature_id_counter += 1
                 point = ET.SubElement(
                     pharmacophore,
@@ -435,7 +434,7 @@ def generate_bindingmode_pharmacophore(
                     optional="false",
                     disabled="false",
                     weight="1.0",
-                    coreCompound=self.ligand_name,
+                    coreCompound=self.complex_name,
                     id=f"feature{str(feature_id_counter)}",
                 )
                 position = ET.SubElement(
@@ -449,8 +448,8 @@ def generate_bindingmode_pharmacophore(
             # Generate plane features
             elif feature_type == "AR":
                 feature_id_counter += 1
-                lig_loc = coords["LIGCOO"][0]
-                prot_loc = coords["PROTCOO"][0]
+                lig_loc = dict_bindingmode[interaction]["LIGCOO"][0]
+                prot_loc = dict_bindingmode[interaction]["PROTCOO"][0]
 
                 # Normalize vector of plane
                 vector = np.array(lig_loc) - np.array(prot_loc)
@@ -465,7 +464,7 @@ def generate_bindingmode_pharmacophore(
                     optional="false",
                     disabled="false",
                     weight="1.0",
-                    coreCompound=self.ligand_name,
+                    coreCompound=self.complex_name,
                     id=f"feature{str(feature_id_counter)}",
                 )
                 position = ET.SubElement(
@@ -526,25 +525,25 @@ def generate_point_cloud_pml(self, outname: str) -> None:
         """
         cloud_dict = {}
         cloud_dict["H"] = self.generate_pharmacophore_centers_all_points(
-            self.df_all.filter(regex="hydrophobic").columns.tolist()
+            self.df_all.filter(regex="hydrophobic").columns
         )
         cloud_dict["HBA"] = self.generate_pharmacophore_centers_all_points(
-            self.df_all.filter(regex="Acceptor_hbond").columns.tolist()
+            self.df_all.filter(regex="Acceptor_hbond").columns
         )
         cloud_dict["HBD"] = self.generate_pharmacophore_centers_all_points(
-            self.df_all.filter(regex="Donor_hbond").columns.tolist()
+            self.df_all.filter(regex="Donor_hbond").columns
         )
         cloud_dict["AR"] = self.generate_pharmacophore_centers_all_points(
-            self.df_all.filter(regex="pistacking").columns.tolist()
+            self.df_all.filter(regex="pistacking").columns
         )
         cloud_dict["PI"] = self.generate_pharmacophore_centers_all_points(
-            self.df_all.filter(regex="PI_saltbridge").columns.tolist()
+            self.df_all.filter(regex="PI_saltbridge").columns
         )
         cloud_dict["NI"] = self.generate_pharmacophore_centers_all_points(
-            self.df_all.filter(regex="NI_saltbridge").columns.tolist()
+            self.df_all.filter(regex="NI_saltbridge").columns
         )
         cloud_dict["M"] = self.generate_pharmacophore_centers_all_points(
-            self.df_all.filter(regex="metal").columns.tolist()
+            self.df_all.filter(regex="metal").columns
         )
 
         pharmacophore = ET.Element(
@@ -554,9 +553,9 @@ def generate_point_cloud_pml(self, outname: str) -> None:
             pharmacophoreType="LIGAND_SCOUT",
         )
         feature_id_counter = 0
-        for feature_type, interactions in cloud_dict.items():
-            for interaction, points in interactions.items():
-                if len(points) > 1:
+        for feature_type in cloud_dict.keys():
+            for interaction in cloud_dict[feature_type].keys():
+                if len(cloud_dict[feature_type][interaction]) > 1:
                     feature_cloud = ET.SubElement(
                         pharmacophore,
                         "featureCloud",
@@ -570,12 +569,12 @@ def generate_point_cloud_pml(self, outname: str) -> None:
                     position = ET.SubElement(
                         feature_cloud,
                         "position",
-                        x3=str(points[0][0]),
-                        y3=str(points[0][1]),
-                        z3=str(points[0][2]),
+                        x3=str(cloud_dict[feature_type][interaction][0][0]),
+                        y3=str(cloud_dict[feature_type][interaction][0][1]),
+                        z3=str(cloud_dict[feature_type][interaction][0][2]),
                     )
-                    for additional_point in points[1:]:
-                        additional_point_element = ET.SubElement(
+                    for additional_point in cloud_dict[feature_type][interaction][1:]:
+                        additional_point = ET.SubElement(
                             feature_cloud,
                             "additionalPoint",
                             x3=str(round(additional_point[0], 2)),

From 0dc21e11061ef5a4aa9acd501831604e84a8078e Mon Sep 17 00:00:00 2001
From: Valerij Talagayev <82884038+talagayev@users.noreply.github.com>
Date: Wed, 6 Nov 2024 16:57:49 +0100
Subject: [PATCH 22/23] Delete openmmdl/openmmdl_analysis/visualization.ipynb

---
 .../openmmdl_analysis/visualization.ipynb     | 94 -------------------
 1 file changed, 94 deletions(-)
 delete mode 100644 openmmdl/openmmdl_analysis/visualization.ipynb

diff --git a/openmmdl/openmmdl_analysis/visualization.ipynb b/openmmdl/openmmdl_analysis/visualization.ipynb
deleted file mode 100644
index ab4e7b09..00000000
--- a/openmmdl/openmmdl_analysis/visualization.ipynb
+++ /dev/null
@@ -1,94 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from openmmdl.openmmdl_analysis.visualization_functions import Visualizer\n",
-    "import MDAnalysis as mda"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "ligname = \"UNK\"  # add the name of the ligand from the original PDB File\n",
-    "special = None  # if special ligand like HEM is present add the name of the special ligand from the original PDB File\n",
-    "md = mda.Universe(\"PATH_TO_interacting_waters.pdb\", \"PATH_TO_interacting_waters.dcd\")\n",
-    "cloud = \"PATH_TO_clouds.json\"\n",
-    "\n",
-    "visualizer = Visualizer(md, cloud, ligname, special)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "09acfdb0e59f46d481905cef43c231c0",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "ThemeManager()"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "4b33c5fd6d6541ed8b9766bdbfe27e42",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "NGLWidget(gui_style='ngl', layout=Layout(height='1000px', width='1000px'), max_frame=9999)"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
-   "source": [
-    "# Predefined variables: receptor_type=\"protein or nucleic\", height=\"1000px\", width=\"1000px\"\n",
-    "view = visualizer.visualize()\n",
-    "view.display(gui=True, style=\"ngl\")"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.10.10"
-  },
-  "vscode": {
-   "interpreter": {
-    "hash": "e021570d17280ebc37ed8e9fc470ea25281789feeed8bd826cec564b6552ec2d"
-   }
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}

From dc5b15464522b979a0c7bce2190fc14e34a1dfc1 Mon Sep 17 00:00:00 2001
From: Valerij Talagayev <82884038+talagayev@users.noreply.github.com>
Date: Wed, 6 Nov 2024 17:09:56 +0100
Subject: [PATCH 23/23] Update rdkit_figure_generation.py

adjusted
---
 .../rdkit_figure_generation.py                | 202 ++++++++++--------
 1 file changed, 111 insertions(+), 91 deletions(-)

diff --git a/openmmdl/openmmdl_analysis/rdkit_figure_generation.py b/openmmdl/openmmdl_analysis/rdkit_figure_generation.py
index 694b0337..94860543 100644
--- a/openmmdl/openmmdl_analysis/rdkit_figure_generation.py
+++ b/openmmdl/openmmdl_analysis/rdkit_figure_generation.py
@@ -186,93 +186,119 @@ def highlight_numbers(
             numeric_codes = parts[1:-1]
             interaction_type = parts[-1]
 
-            for code in numeric_codes:
-                atom_index = int(code)
-                complex_id = self.complex.select_atoms(f"id {atom_index}")
-                atom_name = complex_id[0].name if len(complex_id) > 0 else None
-
-                if atom_name is None:
-                    continue
-
-                lig_atom = self.lig_noh.select_atoms(f"name {atom_name}")
-                lig_real_index = lig_atom[0].id - 1 if len(lig_atom) > 0 else None
-
-                if lig_real_index is None:
-                    continue
-
-                if interaction_type == "hbond":
-                    type = parts[-2]
+            if interaction_type == "hbond":
+                parts = item.split()
+                protein_partner_name = parts[0]
+                numeric_codes = parts[1:-2]
+                type = parts[-2]
+                interaction_type = parts[-1]
+                for code in numeric_codes:
+                    atom_index = int(code)
+                    complex_id = self.complex.select_atoms(f"id {atom_index}")
+                    for atom in complex_id:
+                        atom_name = atom.name
+                    for lig_atom in self.lig_noh:
+                        if lig_atom.name == atom_name:
+                            lig_real_index = lig_atom.id
                     if type == "Donor":
-                        highlighted_hbond_donor.append(lig_real_index)
+                        highlighted_hbond_donor.append(lig_real_index - 1)
                     elif type == "Acceptor":
-                        highlighted_hbond_acceptor.append(lig_real_index)
-
-                elif interaction_type == "hydrophobic":
-                    highlighted_hydrophobic.append(lig_real_index)
-
-                elif interaction_type == "waterbridge":
-                    highlighted_waterbridge.append(lig_real_index)
-
-                elif interaction_type == "pistacking":
+                        highlighted_hbond_acceptor.append(lig_real_index - 1)
+
+            elif interaction_type == "hydrophobic":
+                for code in numeric_codes:
+                    atom_index = int(code)
+                    complex_id = self.complex.select_atoms(f"id {atom_index}")
+                    for atom in complex_id:
+                        atom_name = atom.name
+                    for lig_atom in self.lig_noh:
+                        if lig_atom.name == atom_name:
+                            lig_real_index = lig_atom.id
+                    highlighted_hydrophobic.append(lig_real_index - 1)
+
+            elif interaction_type == "waterbridge":
+                for code in numeric_codes:
+                    atom_index = int(code)
+                    complex_id = self.complex.select_atoms(f"id {atom_index}")
+                    for atom in complex_id:
+                        atom_name = atom.name
+                    for lig_atom in self.lig_noh:
+                        if lig_atom.name == atom_name:
+                            lig_real_index = lig_atom.id
+                    highlighted_waterbridge.append(lig_real_index - 1)
+
+            elif interaction_type == "pistacking":
+                split_codes = numeric_codes[0].split(",")
+                for code in split_codes:
+                    atom_index = int(code)
+                    complex_id = self.complex.select_atoms(f"id {atom_index}")
+                    for atom in complex_id:
+                        atom_name = atom.name
+                    for lig_atom in self.lig_noh:
+                        if lig_atom.name == atom_name:
+                            lig_real_index = lig_atom.id
+                    highlighted_pistacking.append(lig_real_index - 1)
+
+            elif interaction_type == "halogen":
+                numeric_codes = parts[1:-2]
+                for code in numeric_codes:
+                    atom_index = int(code)
+                    complex_id = self.complex.select_atoms(f"id {atom_index}")
+                    for atom in complex_id:
+                        atom_name = atom.name
+                    for lig_atom in self.lig_noh:
+                        if lig_atom.name == atom_name:
+                            lig_real_index = lig_atom.id
+                    highlighted_halogen.append(lig_real_index - 1)
+
+            elif interaction_type == "saltbridge":
+                numeric_codes = parts[1:-3]
+                saltbridge_type = parts[-2]
+                if saltbridge_type == "NI":
                     split_codes = numeric_codes[0].split(",")
                     for code in split_codes:
                         atom_index = int(code)
                         complex_id = self.complex.select_atoms(f"id {atom_index}")
-                        atom_name = complex_id[0].name if len(complex_id) > 0 else None
-                        if atom_name is None:
-                            continue
-                        lig_atom = self.lig_noh.select_atoms(f"name {atom_name}")
-                        lig_real_index = (
-                            lig_atom[0].id - 1 if len(lig_atom) > 0 else None
-                        )
-                        if lig_real_index is not None:
-                            highlighted_pistacking.append(lig_real_index)
-
-                elif interaction_type == "halogen":
-                    highlighted_halogen.append(lig_real_index)
-
-                elif interaction_type == "saltbridge":
-                    saltbridge_type = parts[-2]
-                    if saltbridge_type == "NI":
-                        split_codes = numeric_codes[0].split(",")
-                        for code in split_codes:
-                            atom_index = int(code)
-                            complex_id = self.complex.select_atoms(f"id {atom_index}")
-                            atom_name = (
-                                complex_id[0].name if len(complex_id) > 0 else None
-                            )
-                            if atom_name is None:
-                                continue
-                            lig_atom = self.lig_noh.select_atoms(f"name {atom_name}")
-                            lig_real_index = (
-                                lig_atom[0].id - 1 if len(lig_atom) > 0 else None
-                            )
-                            if lig_real_index is not None:
-                                highlighted_ni.append(lig_real_index)
-
-                    elif saltbridge_type == "PI":
-                        for code in numeric_codes:
-                            atom_index = int(code)
-                            complex_id = self.complex.select_atoms(f"id {atom_index}")
-                            atom_name = (
-                                complex_id[0].name if len(complex_id) > 0 else None
-                            )
-                            if atom_name is None:
-                                continue
-                            lig_atom = self.lig_noh.select_atoms(f"name {atom_name}")
-                            lig_real_index = (
-                                lig_atom[0].id - 1 if len(lig_atom) > 0 else None
-                            )
-                            if lig_real_index is not None:
-                                highlighted_pi.append(lig_real_index)
-
-                elif interaction_type == "pication":
-                    highlighted_pication.append(lig_real_index)
-
-                elif interaction_type == "metal":
-                    highlighted_metal.append(lig_real_index)
-
-        # Identify common indices between hbond_donor and hbond_acceptor
+                        for atom in complex_id:
+                            atom_name = atom.name
+                        for lig_atom in self.lig_noh:
+                            if lig_atom.name == atom_name:
+                                lig_real_index = lig_atom.id
+                        highlighted_ni.append(lig_real_index - 1)
+                elif saltbridge_type == "PI":
+                    for code in numeric_codes:
+                        atom_index = int(code)
+                        complex_id = self.complex.select_atoms(f"id {atom_index}")
+                        for atom in complex_id:
+                            atom_name = atom.name
+                        for lig_atom in self.lig_noh:
+                            if lig_atom.name == atom_name:
+                                lig_real_index = lig_atom.id
+                        highlighted_pi.append(lig_real_index - 1)
+
+            elif interaction_type == "pication":
+                numeric_codes = parts[1:-2]
+                for code in numeric_codes:
+                    atom_index = int(code)
+                    complex_id = self.complex.select_atoms(f"id {atom_index}")
+                    for atom in complex_id:
+                        atom_name = atom.name
+                    for lig_atom in self.lig_noh:
+                        if lig_atom.name == atom_name:
+                            lig_real_index = lig_atom.id
+                    highlighted_pication.append(lig_real_index - 1)
+
+            elif interaction_type == "metal":
+                ligidx = parts[1]
+                atom_index = int(ligidx)
+                complex_id = self.complex.select_atoms(f"id {atom_index}")
+                for atom in complex_id:
+                    atom_name = atom.name
+                for lig_atom in self.lig_noh:
+                    if lig_atom.name == atom_name:
+                        lig_real_index = lig_atom.id
+                highlighted_metal.append(lig_real_index - 1)
+
         for value in highlighted_hbond_donor[:]:
             if value in highlighted_hbond_acceptor:
                 highlighted_hbond_donor.remove(value)
@@ -337,15 +363,9 @@ def update_dict(
         """
         for source_dict in source_dicts:
             for key, value in source_dict.items():
-                if key in target_dict:
-                    # Combine the RGB values (optional approach; adjust as needed)
-                    existing_value = target_dict[key]
-                    target_dict[key] = tuple(
-                        min(1.0, max(0.0, (existing_value[i] + value[i]) / 2))
-                        for i in range(3)
-                    )
-                else:
-                    target_dict[key] = value
+                int_key = int(key)
+                if int_key not in target_dict:
+                    target_dict[int_key] = value
 
 
 class ImageMerger:
@@ -385,7 +405,7 @@ def create_and_merge_images(self) -> List[str]:
             label = data.split()[-1]
             type_ = data.split()[
                 -2
-            ]  # Renamed 'type' to 'type_' to avoid conflict with the built-in type() function
+            ]
             if label == "hydrophobic":
                 (line,) = ax.plot(
                     x, y, label=data, color=(1.0, 1.0, 0.0), linewidth=5.0