Skip to content

Commit

Permalink
feat: enhanced coconut preprocessing
Browse files Browse the repository at this point in the history
  • Loading branch information
Kohulan committed Feb 20, 2024
1 parent 0b4c3ee commit 245a7d5
Show file tree
Hide file tree
Showing 2 changed files with 340 additions and 80 deletions.
281 changes: 281 additions & 0 deletions Untitled.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,281 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "7e5f8f7b",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[13:28:24] Initializing Normalizer\n"
]
}
],
"source": [
"from __future__ import annotations\n",
"\n",
"from chembl_structure_pipeline import checker, standardizer\n",
"from rdkit import Chem\n",
"\n",
"import app.modules.toolkits.cdk_wrapper as cdk\n",
"import app.modules.toolkits.rdkit_wrapper as rdkitmodules\n",
"from app.modules.coconut.descriptors import get_COCONUT_descriptors\n",
"from app.modules.toolkits.helpers import parse_input"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "92da069b",
"metadata": {},
"outputs": [],
"source": [
"def get_mol_block(input_text: str) -> str:\n",
" \"\"\"Generate a Molblock from input text using CDK.\n",
"\n",
" Args:\n",
" input_text (str): Input text (Mol/SMILES).\n",
"\n",
" Returns:\n",
" str: Molblock representation, or an error message if input_text is\n",
" neither a valid Mol nor a valid SMILES (no exception is raised).\n",
" \"\"\"\n",
" check = rdkitmodules.is_valid_molecule(input_text)\n",
"\n",
" if check == \"smiles\":\n",
" molecule = parse_input(input_text, \"cdk\", False)\n",
" mol_block = cdk.get_CDK_SDG_mol(\n",
" molecule,\n",
" V3000=False,\n",
" ).replace(\"$$$$\\n\", \"\")\n",
" return mol_block\n",
" elif check == \"mol\":\n",
" return input_text\n",
" else:\n",
" return \"Error!, Check the input text.\"\n",
"\n",
"\n",
"def get_molecule_hash(molecule: any) -> dict:\n",
" \"\"\"Return various molecule hashes for the provided molecule.\n",
"\n",
" Args:\n",
" molecule (any): RDKit molecule object to hash.\n",
"\n",
" Returns:\n",
" dict: Dictionary containing Formula, Isomeric SMILES, and Canonical SMILES.\n",
" \"\"\"\n",
" if molecule:\n",
" Formula = Chem.rdMolDescriptors.CalcMolFormula(molecule)\n",
" Isomeric_SMILES = Chem.MolToSmiles(molecule, kekuleSmiles=True)\n",
" Canonical_SMILES = Chem.MolToSmiles(\n",
" molecule,\n",
" kekuleSmiles=True,\n",
" isomericSmiles=False,\n",
" )\n",
" return {\n",
" \"Formula\": Formula,\n",
" \"Isomeric_SMILES\": Isomeric_SMILES,\n",
" \"Canonical_SMILES\": Canonical_SMILES,\n",
" }\n",
" else:\n",
" return {\"Error\": \"Check input SMILES\"}\n",
"\n",
"\n",
"def get_representations(molecule: any) -> dict:\n",
" \"\"\"Return COCONUT representations for the provided molecule.\n",
"\n",
" Args:\n",
" molecule (any): RDKit molecule object.\n",
"\n",
" Returns:\n",
" dict: Dictionary containing InChI, InChIKey, and Murcko framework.\n",
" \"\"\"\n",
" if molecule:\n",
" InChI = Chem.inchi.MolToInchi(molecule)\n",
" InChI_Key = Chem.inchi.MolToInchiKey(molecule)\n",
" cdkMolecule = parse_input(Chem.MolToSmiles(molecule), \"cdk\", False)\n",
" Murko = cdk.get_murko_framework(cdkMolecule)\n",
" return {\n",
" \"standard_inchi\": InChI,\n",
" \"standard_inchikey\": InChI_Key,\n",
" \"murko_framework\": Murko,\n",
" }\n",
" else:\n",
" return {\"Error\": \"Check input SMILES\"}\n",
"\n",
"\n",
"def get_COCONUT_preprocessing(input_text: str) -> dict:\n",
" \"\"\"Preprocess user input text into data suitable for COCONUT database submission.\n",
"\n",
" Args:\n",
" input_text (str): Input text (Mol/SMILES).\n",
"\n",
" Returns:\n",
" dict: COCONUT preprocessed data.\n",
" \"\"\"\n",
" original_mol = parse_input(input_text, \"rdkit\", False)\n",
" try:\n",
" original_mol = parse_input(input_text, \"rdkit\", False)\n",
" if original_mol:\n",
" original_mol_block = get_mol_block(input_text)\n",
" original_mol_hash = get_molecule_hash(original_mol)\n",
" original_representations = get_representations(original_mol)\n",
" original_descriptors = get_COCONUT_descriptors(\n",
" input_text,\n",
" \"rdkit\",\n",
" )\n",
"\n",
" standarised_mol_block = standardizer.standardize_molblock(original_mol_block)\n",
" standardised_SMILES = Chem.MolToSmiles(\n",
" Chem.MolFromMolBlock(standarised_mol_block),\n",
" kekuleSmiles=True,\n",
" )\n",
"\n",
" standardised_mol = parse_input(standardised_SMILES, \"rdkit\", False)\n",
" standardised_molecule_hash = get_molecule_hash(standardised_mol)\n",
" standardised_representations = get_representations(standardised_mol)\n",
" standardised_descriptors = get_COCONUT_descriptors(\n",
" standardised_SMILES,\n",
" \"rdkit\",\n",
" )\n",
"\n",
" parent_canonical_smiles = original_mol_hash[\"Canonical_SMILES\"]\n",
" cdkParentMol = parse_input(parent_canonical_smiles, \"cdk\", False)\n",
" parent_2D_molblock = cdk.get_CDK_SDG_mol(cdkParentMol, V3000=False).replace(\n",
" \"$$$$\\n\",\n",
" \"\",\n",
" )\n",
" parent_2D_molblock_v3 = cdk.get_CDK_SDG_mol(cdkParentMol, V3000=True).replace(\n",
" \"$$$$\\n\",\n",
" \"\",\n",
" )\n",
" rdkitParentMol = parse_input(parent_canonical_smiles, \"rdkit\", False)\n",
" parent_3D_molblock = rdkitmodules.get_3d_conformers(rdkitParentMol)\n",
"\n",
" parent_representations = get_representations(rdkitParentMol)\n",
" parent_descriptors = get_COCONUT_descriptors(\n",
" parent_canonical_smiles,\n",
" \"rdkit\",\n",
" )\n",
"\n",
" return {\n",
" \"original\": {\n",
" \"representations\": {\n",
" \"2D_MOL\": original_mol_block,\n",
" \"3D_MOL\": rdkitmodules.get_3d_conformers(original_mol),\n",
" \"cannonical_smiles\": original_mol_hash[\"Isomeric_SMILES\"],\n",
" \"standard_inchi\": original_representations[\"standard_inchi\"],\n",
" \"standard_inchikey\": original_representations[\"standard_inchikey\"],\n",
" \"murko_framework\": original_representations[\"murko_framework\"],\n",
" },\n",
" \"has_stereo\": rdkitmodules.has_stereochemistry(original_mol),\n",
" \"descriptors\": original_descriptors,\n",
" \"errors\": checker.check_molblock(original_mol_block),\n",
" },\n",
" \"standardized\": {\n",
" \"representations\": {\n",
" \"2D_MOL\": standarised_mol_block,\n",
" \"3D_MOL\": rdkitmodules.get_3d_conformers(standardised_mol),\n",
" \"cannonical_smiles\": standardised_SMILES,\n",
" \"standard_inchi\": standardised_representations[\"standard_inchi\"],\n",
" \"standard_inchikey\": standardised_representations[\"standard_inchikey\"],\n",
" \"murko_framework\": standardised_representations[\"murko_framework\"],\n",
" },\n",
" \"has_stereo\": rdkitmodules.has_stereochemistry(standardised_mol),\n",
" \"descriptors\": standardised_descriptors,\n",
" \"errors\": checker.check_molblock(standarised_mol_block),\n",
" },\n",
" \"parent\": {\n",
" \"representations\": {\n",
" \"2D_MOL\": parent_2D_molblock,\n",
" \"3D_MOL\": parent_3D_molblock,\n",
" \"cannonical_smiles\": parent_canonical_smiles,\n",
" \"standard_inchi\": parent_representations[\"standard_inchi\"],\n",
" \"standard_inchikey\": parent_representations[\"standard_inchikey\"],\n",
" \"murko_framework\": parent_representations[\"murko_framework\"],\n",
" },\n",
" \"has_stereo\": rdkitmodules.has_stereochemistry(rdkitParentMol),\n",
" \"descriptors\": parent_descriptors,\n",
" },\n",
" }\n",
" else:\n",
" return {\"Error\": \"Check input SMILES\"}\n",
" except InvalidInputException as e:\n",
" return {\"Error\": f\"Invalid input: {e}\"}\n"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "a527283c",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[13:35:20] SMILES Parse Error: syntax error while parsing: CC@C\n",
"[13:35:20] SMILES Parse Error: Failed parsing SMILES 'CC@C' for input: 'CC@C'\n"
]
},
{
"ename": "InvalidInputException",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mInvalidInputException\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m~/cheminformatics-microservice/app/modules/toolkits/helpers.py\u001b[0m in \u001b[0;36mparse_SMILES\u001b[0;34m(smiles, framework, standardize)\u001b[0m\n\u001b[1;32m 60\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 61\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mInvalidInputException\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"smiles\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0msmiles\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 62\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mException\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mInvalidInputException\u001b[0m: ",
"\nDuring handling of the above exception, another exception occurred:\n",
"\u001b[0;31mInvalidInputException\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m/tmp/ipykernel_129565/2648845104.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mget_COCONUT_preprocessing\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"CC@C\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;32m/tmp/ipykernel_129565/3304377404.py\u001b[0m in \u001b[0;36mget_COCONUT_preprocessing\u001b[0;34m(input_text)\u001b[0m\n\u001b[1;32m 86\u001b[0m \u001b[0mdict\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mCOCONUT\u001b[0m \u001b[0mpreprocessed\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 87\u001b[0m \"\"\"\n\u001b[0;32m---> 88\u001b[0;31m \u001b[0moriginal_mol\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mparse_input\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0minput_text\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"rdkit\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 89\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 90\u001b[0m \u001b[0moriginal_mol\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mparse_input\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0minput_text\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"rdkit\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/cheminformatics-microservice/app/modules/toolkits/helpers.py\u001b[0m in \u001b[0;36mparse_input\u001b[0;34m(input, framework, standardize)\u001b[0m\n\u001b[1;32m 25\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 26\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mformat\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m\"SMILES\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 27\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mparse_SMILES\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mframework\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstandardize\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 28\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 29\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/cheminformatics-microservice/app/modules/toolkits/helpers.py\u001b[0m in \u001b[0;36mparse_SMILES\u001b[0;34m(smiles, framework, standardize)\u001b[0m\n\u001b[1;32m 61\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mInvalidInputException\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"smiles\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0msmiles\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 62\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mException\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 63\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mInvalidInputException\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"smiles\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0msmiles\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;31mInvalidInputException\u001b[0m: "
]
}
],
"source": [
"get_COCONUT_preprocessing(\"CC@C\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a141d996",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Loading

0 comments on commit 245a7d5

Please sign in to comment.