-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: enhanced coconut preprocessing
- Loading branch information
Showing
2 changed files
with
340 additions
and
80 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,281 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 1, | ||
"id": "7e5f8f7b", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stderr", | ||
"output_type": "stream", | ||
"text": [ | ||
"[13:28:24] Initializing Normalizer\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"from __future__ import annotations\n", | ||
"\n", | ||
"from chembl_structure_pipeline import checker, standardizer\n", | ||
"from rdkit import Chem\n", | ||
"\n", | ||
"import app.modules.toolkits.cdk_wrapper as cdk\n", | ||
"import app.modules.toolkits.rdkit_wrapper as rdkitmodules\n", | ||
"from app.modules.coconut.descriptors import get_COCONUT_descriptors\n", | ||
"from app.modules.toolkits.helpers import parse_input" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 19, | ||
"id": "92da069b", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"def get_mol_block(input_text: str) -> str:\n", | ||
" \"\"\"Generate a Molblock from input text using CDK.\n", | ||
"\n", | ||
" Args:\n", | ||
" input_text (str): Input text (Mol/SMILES).\n", | ||
"\n", | ||
" Returns:\n", | ||
" str: Molblock representation.\n", | ||
"\n", | ||
" Raises:\n", | ||
" ValueError: If input_text is not a valid Mol or SMILES.\n", | ||
" \"\"\"\n", | ||
" check = rdkitmodules.is_valid_molecule(input_text)\n", | ||
"\n", | ||
" if check == \"smiles\":\n", | ||
" molecule = parse_input(input_text, \"cdk\", False)\n", | ||
" mol_block = cdk.get_CDK_SDG_mol(\n", | ||
" molecule,\n", | ||
" V3000=False,\n", | ||
" ).replace(\"$$$$\\n\", \"\")\n", | ||
" return mol_block\n", | ||
" elif check == \"mol\":\n", | ||
" return input_text\n", | ||
" else:\n", | ||
" return \"Error!, Check the input text.\"\n", | ||
"\n", | ||
"\n", | ||
"def get_molecule_hash(molecule: any) -> dict:\n", | ||
" \"\"\"Return various molecule hashes for the provided SMILES.\n", | ||
"\n", | ||
" Args:\n", | ||
" smiles (str): Standardized SMILES string.\n", | ||
"\n", | ||
" Returns:\n", | ||
" dict: Dictionary containing Formula, Isomeric SMILES, and Canonical SMILES.\n", | ||
" \"\"\"\n", | ||
" if molecule:\n", | ||
" Formula = Chem.rdMolDescriptors.CalcMolFormula(molecule)\n", | ||
" Isomeric_SMILES = Chem.MolToSmiles(molecule, kekuleSmiles=True)\n", | ||
" Canonical_SMILES = Chem.MolToSmiles(\n", | ||
" molecule,\n", | ||
" kekuleSmiles=True,\n", | ||
" isomericSmiles=False,\n", | ||
" )\n", | ||
" return {\n", | ||
" \"Formula\": Formula,\n", | ||
" \"Isomeric_SMILES\": Isomeric_SMILES,\n", | ||
" \"Canonical_SMILES\": Canonical_SMILES,\n", | ||
" }\n", | ||
" else:\n", | ||
" return {\"Error\": \"Check input SMILES\"}\n", | ||
"\n", | ||
"\n", | ||
"def get_representations(molecule: any) -> dict:\n", | ||
" \"\"\"Return COCONUT representations for the provided SMILES.\n", | ||
"\n", | ||
" Args:\n", | ||
" smiles (str): SMILES string.\n", | ||
"\n", | ||
" Returns:\n", | ||
" dict: Dictionary containing InChI, InChi Key, and Murko framework.\n", | ||
" \"\"\"\n", | ||
" if molecule:\n", | ||
" InChI = Chem.inchi.MolToInchi(molecule)\n", | ||
" InChI_Key = Chem.inchi.MolToInchiKey(molecule)\n", | ||
" cdkMolecule = parse_input(Chem.MolToSmiles(molecule), \"cdk\", False)\n", | ||
" Murko = cdk.get_murko_framework(cdkMolecule)\n", | ||
" return {\n", | ||
" \"standard_inchi\": InChI,\n", | ||
" \"standard_inchikey\": InChI_Key,\n", | ||
" \"murko_framework\": Murko,\n", | ||
" }\n", | ||
" else:\n", | ||
" return {\"Error\": \"Check input SMILES\"}\n", | ||
"\n", | ||
"\n", | ||
"def get_COCONUT_preprocessing(input_text: str) -> dict:\n", | ||
" \"\"\"Preprocess user input text suitable for the COCONUT database submission.\n", | ||
"\n", | ||
" data.\n", | ||
"\n", | ||
" Args:\n", | ||
" input_text (str): Input text (Mol/str).\n", | ||
"\n", | ||
" Returns:\n", | ||
" dict: COCONUT preprocessed data.\n", | ||
" \"\"\"\n", | ||
" original_mol = parse_input(input_text, \"rdkit\", False)\n", | ||
" try:\n", | ||
" original_mol = parse_input(input_text, \"rdkit\", False)\n", | ||
" if original_mol:\n", | ||
" original_mol_block = get_mol_block(input_text)\n", | ||
" original_mol_hash = get_molecule_hash(original_mol)\n", | ||
" original_representations = get_representations(original_mol)\n", | ||
" original_descriptors = get_COCONUT_descriptors(\n", | ||
" input_text,\n", | ||
" \"rdkit\",\n", | ||
" )\n", | ||
"\n", | ||
" standarised_mol_block = standardizer.standardize_molblock(original_mol_block)\n", | ||
" standardised_SMILES = Chem.MolToSmiles(\n", | ||
" Chem.MolFromMolBlock(standarised_mol_block),\n", | ||
" kekuleSmiles=True,\n", | ||
" )\n", | ||
"\n", | ||
" standardised_mol = parse_input(standardised_SMILES, \"rdkit\", False)\n", | ||
" standardised_molecule_hash = get_molecule_hash(standardised_mol)\n", | ||
" standardised_representations = get_representations(standardised_mol)\n", | ||
" standardised_descriptors = get_COCONUT_descriptors(\n", | ||
" standardised_SMILES,\n", | ||
" \"rdkit\",\n", | ||
" )\n", | ||
"\n", | ||
" parent_canonical_smiles = original_mol_hash[\"Canonical_SMILES\"]\n", | ||
" cdkParentMol = parse_input(parent_canonical_smiles, \"cdk\", False)\n", | ||
" parent_2D_molblock = cdk.get_CDK_SDG_mol(cdkParentMol, V3000=False).replace(\n", | ||
" \"$$$$\\n\",\n", | ||
" \"\",\n", | ||
" )\n", | ||
" parent_2D_molblock_v3 = cdk.get_CDK_SDG_mol(cdkParentMol, V3000=True).replace(\n", | ||
" \"$$$$\\n\",\n", | ||
" \"\",\n", | ||
" )\n", | ||
" rdkitParentMol = parse_input(parent_canonical_smiles, \"rdkit\", False)\n", | ||
" parent_3D_molblock = rdkitmodules.get_3d_conformers(rdkitParentMol)\n", | ||
"\n", | ||
" parent_representations = get_representations(rdkitParentMol)\n", | ||
" parent_descriptors = get_COCONUT_descriptors(\n", | ||
" parent_canonical_smiles,\n", | ||
" \"rdkit\",\n", | ||
" )\n", | ||
"\n", | ||
" return {\n", | ||
" \"original\": {\n", | ||
" \"representations\": {\n", | ||
" \"2D_MOL\": original_mol_block,\n", | ||
" \"3D_MOL\": rdkitmodules.get_3d_conformers(original_mol),\n", | ||
" \"cannonical_smiles\": original_mol_hash[\"Isomeric_SMILES\"],\n", | ||
" \"standard_inchi\": original_representations[\"standard_inchi\"],\n", | ||
" \"standard_inchikey\": original_representations[\"standard_inchikey\"],\n", | ||
" \"murko_framework\": original_representations[\"murko_framework\"],\n", | ||
" },\n", | ||
" \"has_stereo\": rdkitmodules.has_stereochemistry(original_mol),\n", | ||
" \"descriptors\": original_descriptors,\n", | ||
" \"errors\": checker.check_molblock(original_mol_block),\n", | ||
" },\n", | ||
" \"standardized\": {\n", | ||
" \"representations\": {\n", | ||
" \"2D_MOL\": standarised_mol_block,\n", | ||
" \"3D_MOL\": rdkitmodules.get_3d_conformers(standardised_mol),\n", | ||
" \"cannonical_smiles\": standardised_SMILES,\n", | ||
" \"standard_inchi\": standardised_representations[\"standard_inchi\"],\n", | ||
" \"standard_inchikey\": standardised_representations[\"standard_inchikey\"],\n", | ||
" \"murko_framework\": standardised_representations[\"murko_framework\"],\n", | ||
" },\n", | ||
" \"has_stereo\": rdkitmodules.has_stereochemistry(standardised_mol),\n", | ||
" \"descriptors\": standardised_descriptors,\n", | ||
" \"errors\": checker.check_molblock(standarised_mol_block),\n", | ||
" },\n", | ||
" \"parent\": {\n", | ||
" \"representations\": {\n", | ||
" \"2D_MOL\": parent_2D_molblock,\n", | ||
" \"3D_MOL\": parent_3D_molblock,\n", | ||
" \"cannonical_smiles\": parent_canonical_smiles,\n", | ||
" \"standard_inchi\": parent_representations[\"standard_inchi\"],\n", | ||
" \"standard_inchikey\": parent_representations[\"standard_inchikey\"],\n", | ||
" \"murko_framework\": parent_representations[\"murko_framework\"],\n", | ||
" },\n", | ||
" \"has_stereo\": rdkitmodules.has_stereochemistry(rdkitParentMol),\n", | ||
" \"descriptors\": parent_descriptors,\n", | ||
" },\n", | ||
" }\n", | ||
" else:\n", | ||
" return {\"Error\": \"Check input SMILES\"}\n", | ||
" except InvalidInputException as e:\n", | ||
" return {\"Error\": f\"Invalid input: {e}\"}\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 20, | ||
"id": "a527283c", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stderr", | ||
"output_type": "stream", | ||
"text": [ | ||
"[13:35:20] SMILES Parse Error: syntax error while parsing: CC@C\n", | ||
"[13:35:20] SMILES Parse Error: Failed parsing SMILES 'CC@C' for input: 'CC@C'\n" | ||
] | ||
}, | ||
{ | ||
"ename": "InvalidInputException", | ||
"evalue": "", | ||
"output_type": "error", | ||
"traceback": [ | ||
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", | ||
"\u001b[0;31mInvalidInputException\u001b[0m Traceback (most recent call last)", | ||
"\u001b[0;32m~/cheminformatics-microservice/app/modules/toolkits/helpers.py\u001b[0m in \u001b[0;36mparse_SMILES\u001b[0;34m(smiles, framework, standardize)\u001b[0m\n\u001b[1;32m 60\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 61\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mInvalidInputException\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"smiles\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0msmiles\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 62\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mException\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | ||
"\u001b[0;31mInvalidInputException\u001b[0m: ", | ||
"\nDuring handling of the above exception, another exception occurred:\n", | ||
"\u001b[0;31mInvalidInputException\u001b[0m Traceback (most recent call last)", | ||
"\u001b[0;32m/tmp/ipykernel_129565/2648845104.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mget_COCONUT_preprocessing\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"CC@C\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", | ||
"\u001b[0;32m/tmp/ipykernel_129565/3304377404.py\u001b[0m in \u001b[0;36mget_COCONUT_preprocessing\u001b[0;34m(input_text)\u001b[0m\n\u001b[1;32m 86\u001b[0m \u001b[0mdict\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mCOCONUT\u001b[0m \u001b[0mpreprocessed\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 87\u001b[0m \"\"\"\n\u001b[0;32m---> 88\u001b[0;31m \u001b[0moriginal_mol\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mparse_input\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0minput_text\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"rdkit\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 89\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 90\u001b[0m \u001b[0moriginal_mol\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mparse_input\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0minput_text\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"rdkit\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | ||
"\u001b[0;32m~/cheminformatics-microservice/app/modules/toolkits/helpers.py\u001b[0m in \u001b[0;36mparse_input\u001b[0;34m(input, framework, standardize)\u001b[0m\n\u001b[1;32m 25\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 26\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mformat\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m\"SMILES\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 27\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mparse_SMILES\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mframework\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstandardize\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 28\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 29\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", | ||
"\u001b[0;32m~/cheminformatics-microservice/app/modules/toolkits/helpers.py\u001b[0m in \u001b[0;36mparse_SMILES\u001b[0;34m(smiles, framework, standardize)\u001b[0m\n\u001b[1;32m 61\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mInvalidInputException\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"smiles\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0msmiles\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 62\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mException\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 63\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mInvalidInputException\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"smiles\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0msmiles\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", | ||
"\u001b[0;31mInvalidInputException\u001b[0m: " | ||
] | ||
} | ||
], | ||
"source": [ | ||
"get_COCONUT_preprocessing(\"CC@C\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "a141d996", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3 (ipykernel)", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.9.13" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 5 | ||
} |
Oops, something went wrong.