Issue #59/migrate sdrmsd to rdock_utils (#67)

* add numpy and openbabel as requirements * add files to ignore * refactor to python3 and add typing for superpose3D, squared_distance and rmsd functions * add typing for coordinates add and refactor map_to_crystal function * add parseArguments function with type annotations * update variable and function names add update_coordinates function with type annotations * add get_automorphism_RMSD function with type annotations * add save_molecule_with_RMSD function with type annotation * refactor superpose_3D function * update save_molecule_with_rmsd: remove unnecessary condition add main function * add aligned-fit sdf test file * add aligned-nofit sdf test file * add input sdf test file * add ref sdf test file * add conftest with fixtures folder path as constant * add sdrms conftest file * add aligned molecules raw data fixture * add aligned fit molecule raw data fixture * add integration tests * code formatting * add aligned_nofit_filename and aligned_fit_filename fixtures * add testing comparison functions for molecules: - atoms_are_equal - molecules_are_equal - compare_sdf_files * extract parser to its own file * add sdrmsd script package to pyproject.toml * update github actions workflow for rdock-utils - add ruff - remove flake, black, isort - add pytest * add dynamic dependencies to pyproject.toml * remove unnecessary docstrings * add old sdrmrsd to be execute as script * lint and typing
CBDD · Jan 27, 2024 · 5c7b9ea · 5c7b9ea
1 parent c14f0fd
commit 5c7b9ea
Show file tree

Hide file tree

Showing 21 changed files with 4,187 additions and 8 deletions.
diff --git a/.gitignore b/.gitignore
@@ -14,4 +14,19 @@ build/linux-g++-64
 
 # test files
 *.as
-tests/results
+tests/results
+
+# rdock original files
+rdock-utils/scripts copy/make_grid.csh
+rdock-utils/scripts copy/rbhtfinder
+rdock-utils/scripts copy/run_rbdock.pl
+rdock-utils/scripts copy/run_rbscreen.pl
+rdock-utils/scripts copy/sdfield
+rdock-utils/scripts copy/sdfilter
+rdock-utils/scripts copy/sdmodify
+rdock-utils/scripts copy/sdreport
+rdock-utils/scripts copy/sdrmsd
+rdock-utils/scripts copy/sdsort
+rdock-utils/scripts copy/sdsplit
+rdock-utils/scripts copy/sdtether
+rdock-utils/scripts copy/to_unix
diff --git a/rdock-utils/Dockerfile b/rdock-utils/Dockerfile
@@ -1,5 +1,18 @@
 FROM python:3.12 AS base
 
+RUN DEBIAN_FRONTEND=noninteractive apt update -y \
+    && apt install -y --no-install-recommends \
+        swig \
+        openbabel \
+        libopenbabel-dev \
+    && rm -rf /var/lib/apt/lists/*
+
+# this is a bit of a mess but it is a quick way to be able to
+# pip install openbable, we will improve it later
+RUN ln -s /usr/include/openbabel3 /usr/local/include/openbabel3
+RUN ln -s /lib/x86_64-linux-gnu/libopenbabel.so /usr/local/lib/libopenbabel.so
+
+
 COPY requirements.txt requirements.txt
 RUN python -m pip install --upgrade pip
 RUN python -m pip install -r requirements.txt

diff --git a/rdock-utils/pyproject.toml b/rdock-utils/pyproject.toml
@@ -1,20 +1,27 @@
 [project]
+dynamic = ["dependencies", "optional-dependencies"]
 name = "rdock-utils"
 version = "0.1.0"
 description = "Utilities for working with RDock and operating on SD files"
 requires-python = ">=3.10.0"
 
+[tool.setuptools.dynamic]
+dependencies = {file = ["requirements.txt"]}
+optional-dependencies = { dev = {file = ["requirements-dev.txt"]} }
+
 [project.scripts]
 sdfield = "rdock_utils.sdfield:main"
 sdfilter = "rdock_utils.sdfilter:main"
+sdrmsd_old = "rdock_utils.sdrmsd_original:main"
+sdrmsd = "rdock_utils.sdrmsd.main:main"
 
 [project.urls]
 Repository = "https://github.com/CBDD/rDock.git"
 
 [tool.ruff]
 line-length = 119
 target-version = "py312"
-exclude = [".git","__pycache__"]
+exclude = [".git", "__pycache__", "rdock_utils/sdrmsd_original.py"]
 
 [tool.ruff.lint]
 select = ["E4", "E7", "E9", "F", "I"]
@@ -55,4 +62,4 @@ no_implicit_reexport = false
 
 strict_equality = true
 
-exclude = ['^setup\.py$']
+exclude = ["rdock_utils/sdrmsd_original.py", "tests/"]
diff --git a/rdock-utils/rdock_utils/sdnothing.py b/rdock-utils/rdock_utils/sdnothing.py
diff --git a/rdock-utils/rdock_utils/sdrmsd/__init__.py b/rdock-utils/rdock_utils/sdrmsd/__init__.py
diff --git a/rdock-utils/rdock_utils/sdrmsd/main.py b/rdock-utils/rdock_utils/sdrmsd/main.py
@@ -0,0 +1,13 @@
+from .parser import get_parser
+from .sdrmsd import SDRMSD
+
+
+def main(argv: list[str] | None = None) -> None:
+    parser = get_parser()
+    args = parser.parse_args(argv)
+    sdrmsd = SDRMSD(args.reference, args.input, args.out, args.fit, args.threshold)
+    sdrmsd.run()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/rdock-utils/rdock_utils/sdrmsd/parser.py b/rdock-utils/rdock_utils/sdrmsd/parser.py
@@ -0,0 +1,54 @@
+import argparse
+
+
+def get_parser() -> argparse.ArgumentParser:
+    parser = argparse.ArgumentParser(
+        prog="SDRMSD",
+        usage="%(prog)s [options] reference.sdf input.sdf",
+        description="Superpose molecules before RMSD calculation",
+        epilog=(
+            "Arguments:\n"
+            "   reference.sdf   SDF file with the reference molecule.\n"
+            "   input.sdf       SDF file with the molecules to be compared to reference.\n"
+        ),
+    )
+    parser.add_argument(
+        "reference",
+        type=str,
+        help="Path to the SDF file with the reference molecule.",
+    )
+    parser.add_argument(
+        "input",
+        type=str,
+        help="Path to the SDF file with the molecules to be compared to reference.",
+    )
+    parser.add_argument(
+        "-f",
+        "--fit",
+        action="store_true",
+        default=False,
+        help="Superpose molecules before RMSD calculation",
+    )
+    parser.add_argument(
+        "-t",
+        "--threshold",
+        action="store",
+        default=None,
+        type=float,
+        help=(
+            "Discard poses with RMSD < THRESHOLD with respect previous poses "
+            "which were not rejected based on the same principle. A Population "
+            "SDField will be added to output SD with the population number."
+        ),
+    )
+    parser.add_argument(
+        "-o",
+        "--out",
+        default=None,
+        metavar="FILE",
+        help=(
+            "If declared, write an output SDF file with the input molecules with "
+            "a new sdfield <RMSD>. If the molecule was fitted, the fitted molecule coordinates will be saved."
+        ),
+    )
+    return parser
diff --git a/rdock-utils/rdock_utils/sdrmsd/sdrmsd.py b/rdock-utils/rdock_utils/sdrmsd/sdrmsd.py
@@ -0,0 +1,148 @@
+import functools
+import logging
+import math
+
+from openbabel import pybel
+
+from .superpose3d import MolAlignmentData, Superpose3D
+from .types import AutomorphismRMSD, CoordsArray, PoseMatchData, SDRMSDData
+
+logger = logging.getLogger("SDRMSD")
+
+
+class SDRMSD:
+    def __init__(
+        self,
+        reference_filename: str,
+        input_filename: str,
+        output_filename: str,
+        fit: bool = False,
+        threshold: float | None = None,
+    ) -> None:
+        self.reference_filename = reference_filename
+        self.input_filename = input_filename
+        self.fit = fit
+        self.output_filename = output_filename
+        self.threshold = threshold
+
+    def run(self) -> None:
+        # Find the RMSD between the crystal pose and each docked pose
+        docked_poses = pybel.readfile("sdf", self.input_filename)
+        self.display_fit_message()
+        data = SDRMSDData()
+
+        # Read crystal pose
+        crystal_pose = self.get_crystal_pose()
+        crystal_atoms = len(crystal_pose.atoms)
+
+        for i, docked_pose in enumerate(docked_poses, start=1):
+            atoms_number = self.process_docked_pose(docked_pose)
+
+            if atoms_number != crystal_atoms:
+                data.skipped.append(i)
+                continue
+
+            rmsd_result = self.calculate_rmsd(crystal_pose, docked_pose)
+            pose_match_data = PoseMatchData(i, docked_pose, data)
+            self.handle_pose_matching(rmsd_result, pose_match_data)
+
+        if self.output_filename:
+            self.process_and_save_selected_molecules(data)
+
+        if data.skipped:
+            logger.warning(f"SKIPPED input molecules due to the number of atom mismatch: {data.skipped}")
+
+    def get_automorphism_rmsd(self, target: pybel.Molecule, molecule: pybel.Molecule) -> AutomorphismRMSD:
+        superposer = Superpose3D(MolAlignmentData(molecule), MolAlignmentData(target))
+        result = superposer.automorphism_rmsd(self.fit)
+        return result
+
+    def update_coordinates(self, obmol: pybel.Molecule, new_coordinates: CoordsArray) -> None:
+        for i, atom in enumerate(obmol):
+            atom.OBAtom.SetVector(*new_coordinates[i])
+
+    def get_best_matching_pose(self, pose_match_data: PoseMatchData) -> tuple[int | None, float]:
+        threshold = self.threshold or math.inf
+        docked_pose = pose_match_data.docked_pose
+        molecules_dict = pose_match_data.sdrmsd_data.molecules_dict
+        get_rmsd = functools.partial(self.get_automorphism_rmsd, target=docked_pose)
+        poses_rmsd = ((index, get_rmsd(molecule)[0]) for index, molecule in molecules_dict.items())
+        filtered_by_threshold = (t for t in poses_rmsd if t[1] < threshold)
+        return min(filtered_by_threshold, key=lambda t: t[1], default=(None, math.inf))
+
+    def process_and_save_selected_molecules(self, data: SDRMSDData) -> None:
+        with pybel.Outputfile("sdf", self.output_filename, overwrite=True) as output_sdf:
+            for i in sorted(data.out_dict.keys()):
+                molecule, rmsd_result = data.out_dict[i]
+                # Get the number of matches in the thresholding operation
+                population_value = data.population.get(i, 1)
+                self.save_molecule_with_rmsd(output_sdf, molecule, rmsd_result, population_value)
+
+    def save_molecule_with_rmsd(
+        self, output_sdf: pybel.Outputfile, molecule: pybel.Molecule, rmsd: float, population_value: int
+    ) -> None:
+        new_data = pybel.ob.OBPairData()
+        new_data.SetAttribute("RMSD")
+        new_data.SetValue(f"{rmsd:.3f}")
+
+        if self.threshold:
+            pop_data = pybel.ob.OBPairData()
+            pop_data.SetAttribute("Population")
+            pop_data.SetValue(f"{population_value}")
+            molecule.OBMol.CloneData(pop_data)
+
+        molecule.OBMol.CloneData(new_data)
+        output_sdf.write(molecule)
+
+    def get_crystal_pose(self) -> pybel.Molecule:
+        crystal_pose = next(pybel.readfile("sdf", self.reference_filename))
+        crystal_pose.removeh()
+        return crystal_pose
+
+    def display_fit_message(self) -> None:
+        message = "FIT" if self.fit else "NOFIT"
+        print(f"POSE\tRMSD_{message}")
+
+    def process_docked_pose(self, docked_pose: pybel.Molecule) -> int:
+        docked_pose.removeh()
+        return len(docked_pose.atoms)
+
+    def calculate_rmsd(self, crystal: pybel.Molecule, docked_pose: pybel.Molecule) -> float:
+        rmsd, fitted_coords = self.get_automorphism_rmsd(crystal, docked_pose)
+
+        if self.fit:
+            if fitted_coords is not None:
+                self.update_coordinates(docked_pose, fitted_coords)
+            else:
+                logger.warning("Automorphism failed. skipping alignment")
+
+        return rmsd
+
+    def handle_pose_matching(self, rmsd: float, pose_match_data: PoseMatchData) -> None:
+        if self.threshold:
+            match_pose, best_match_value = self.get_best_matching_pose(pose_match_data)
+            if match_pose is not None:
+                self.print_matching_info(pose_match_data, match_pose, best_match_value)
+            else:
+                self.save_and_print_info(rmsd, pose_match_data)
+        else:
+            self.save_and_print_info(rmsd, pose_match_data)
+
+    def print_matching_info(self, pose_match_data: PoseMatchData, match_pose: int, best_match_value: float) -> None:
+        index = pose_match_data.pose_index
+        population = pose_match_data.sdrmsd_data.population
+        logger.info(f"Pose {index} matches pose {match_pose} with {best_match_value:.3f} RMSD")
+        population[match_pose] += 1
+
+    def save_and_print_info(self, rmsd: float, pose_match_data: PoseMatchData) -> None:
+        index = pose_match_data.pose_index
+        docked_pose = pose_match_data.docked_pose
+        out_dict = pose_match_data.sdrmsd_data.out_dict
+        molecules_dict = pose_match_data.sdrmsd_data.molecules_dict
+        population = pose_match_data.sdrmsd_data.population
+
+        if self.output_filename:
+            out_dict[index] = (docked_pose, rmsd)
+        print(f"{index}\t{rmsd:.2f}")
+        molecules_dict[index] = docked_pose
+        population[index] = 1