Skip to content

Commit

Permalink
fix: add canonicalisation to smiles generation
Browse files Browse the repository at this point in the history
  • Loading branch information
Kohulan committed Apr 23, 2024
1 parent af9d41f commit f5c2713
Show file tree
Hide file tree
Showing 2 changed files with 58 additions and 17 deletions.
10 changes: 2 additions & 8 deletions app/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,7 @@
from prometheus_fastapi_instrumentator import Instrumentator

from .routers import chem
from .routers import converters
from .routers import depict
from .routers import ocsr
from .routers import tools

from app.exception_handlers import input_exception_handler
from app.exception_handlers import InvalidInputException
from app.schemas import HealthCheck
Expand All @@ -34,10 +31,7 @@
)

app.include_router(chem.router)
app.include_router(converters.router)
app.include_router(depict.router)
app.include_router(tools.router)
app.include_router(ocsr.router)


app = VersionedFastAPI(
app,
Expand Down
65 changes: 56 additions & 9 deletions app/modules/coconut/preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,59 @@ def get_parent_smiles(molecule: Chem.Mol) -> str:
return "Error Check input SMILES"


def get_smiles(molecule: Chem.Mol, isomeric: bool = True) -> str:
    """
    Retrieves the canonicalised SMILES string (Isomeric or Canonical) for a given RDKit molecule object.

    The molecule is written to a kekulized SMILES string, re-parsed, canonicalised and
    written out again as a kekulized SMILES string.

    Args:
        molecule (Chem.Mol): An RDKit molecule object representing the molecular structure.
        isomeric (bool, optional): Whether to retrieve the Isomeric SMILES (True) or the Canonical SMILES (False).
            Passed to RDKit as ``isomericSmiles``. Defaults to True.

    Returns:
        str: The Isomeric or Canonical SMILES string for the given molecule, or the
            string "Error Check input SMILES" when the molecule is missing or any
            intermediate SMILES string cannot be parsed back into a molecule.
    """
    error_message = "Error Check input SMILES"

    if not molecule:
        return error_message

    initial_smiles = Chem.MolToSmiles(
        molecule, isomericSmiles=isomeric, kekuleSmiles=True
    )

    # Parse explicitly rather than calling Chem.CanonSmiles: that helper passes
    # its parse result straight to MolToSmiles, so an unparseable string would
    # raise there instead of reaching the error return below.
    parsed_mol = Chem.MolFromSmiles(initial_smiles)
    if not parsed_mol:
        return error_message

    # Same canonicalisation step Chem.CanonSmiles performs with its defaults
    # (stereo information kept), followed by the re-parse the caller relied on.
    canonical_mol = Chem.MolFromSmiles(Chem.MolToSmiles(parsed_mol))
    if not canonical_mol:
        return error_message

    return Chem.MolToSmiles(
        canonical_mol, isomericSmiles=isomeric, kekuleSmiles=True
    )


def get_standardized_smiles(standardized_mol_block: str) -> str:
    """
    Get the standardized SMILES representation of a molecule.

    This function takes a standardized molecular structure represented as a MolBlock and generates the corresponding
    standardized SMILES representation (isomeric, kekulized and canonicalised).

    Args:
        standardized_mol_block (str): The standardized molecular structure in MolBlock format.

    Returns:
        str: The standardized SMILES representation of the molecule, or the string
            "Error Check input SMILES" when the MolBlock cannot be parsed or the
            SMILES cannot be canonicalised.
    """
    # MolFromMolBlock yields None for an unparseable MolBlock; passing that
    # straight to MolToSmiles would raise. get_smiles returns the error string
    # for a falsy molecule and otherwise performs the same
    # write -> canonicalise -> write sequence with isomeric SMILES enabled.
    standardized_mol = Chem.MolFromMolBlock(standardized_mol_block)
    return get_smiles(standardized_mol, isomeric=True)


def get_molecule_hash(molecule: Chem.Mol) -> dict:
"""Return various molecule hashes for the provided SMILES.
Expand All @@ -83,12 +136,8 @@ def get_molecule_hash(molecule: Chem.Mol) -> dict:
"""
if molecule:
Formula = Chem.rdMolDescriptors.CalcMolFormula(molecule)
Isomeric_SMILES = Chem.MolToSmiles(molecule, kekuleSmiles=True)
Canonical_SMILES = Chem.MolToSmiles(
molecule,
kekuleSmiles=True,
isomericSmiles=False,
)
Isomeric_SMILES = get_smiles(molecule, isomeric=True)
Canonical_SMILES = get_smiles(molecule, isomeric=False)
Parent_SMILES = get_parent_smiles(molecule)
return {
"Formula": Formula,
Expand Down Expand Up @@ -152,9 +201,7 @@ def get_COCONUT_preprocessing(

# Standardized molecule
standardized_mol_block = standardizer.standardize_molblock(original_mol_block)
standardized_SMILES = Chem.MolToSmiles(
Chem.MolFromMolBlock(standardized_mol_block), kekuleSmiles=True
)
standardized_SMILES = get_standardized_smiles(standardized_mol_block)
standardized_mol = parse_input(standardized_SMILES, "rdkit", False)
standardized_representations = get_representations(standardized_mol)

Expand Down

0 comments on commit f5c2713

Please sign in to comment.