Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add KLIFS drugs data #107

Merged
merged 10 commits into from
Sep 23, 2021
770 changes: 467 additions & 303 deletions docs/tutorials/databases_klifs.ipynb

Large diffs are not rendered by default.

8 changes: 8 additions & 0 deletions opencadd/data/klifs_fields.csv
Original file line number Diff line number Diff line change
Expand Up @@ -99,3 +99,11 @@ coordinates,residue.klifs_id,Int32,,,
coordinates,residue.klifs_region_id,string,,,
coordinates,residue.klifs_region,string,,,
coordinates,residue.klifs_color,string,,,
drugs,drug.inn,string,INN,,
drugs,drug.brand_name,string,Brand name,,
drugs,drug.synonym,string,Synonyms,,
drugs,drug.phase,string,Phase,,
drugs,drug.approval_year,string,Approval,,
drugs,drug.smiles,string,SMILES,,
drugs,ligand.chembl_id,string,ChEMBL,,
drugs,ligand.expo_id,string,PDB,,
80 changes: 74 additions & 6 deletions opencadd/databases/klifs/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,19 +138,25 @@ def _standardize_column_values(dataframe):
local data is already performed upon session initialization.
"""

# TODO Use None instead of "-"; but may affect downstream pipelines that use "-" already
if "structure.alternate_model" in dataframe.columns:
# Remote
dataframe["structure.alternate_model"].replace("", "-", inplace=True)
if "ligand.expo_id" in dataframe.columns:
# Remote
dataframe["ligand.expo_id"].replace(0, "-", inplace=True)
if "ligand_allosteric.expo_id" in dataframe.columns:
# Remote
dataframe["ligand_allosteric.expo_id"].replace(0, "-", inplace=True)
if "structure.resolution" in dataframe.columns:
# Remote
dataframe["structure.resolution"].replace(0, np.nan, inplace=True)

if "drug.brand_name" in dataframe.columns:
dataframe["drug.brand_name"] = dataframe["drug.brand_name"].apply(
lambda x: x.split(";") if x != "" else []
)
if "drug.synonyms" in dataframe.columns:
dataframe["drug.synonyms"] = dataframe["drug.synonyms"].apply(
lambda x: x.split("\t") if x != "" else []
)

return dataframe

def _standardize_dataframe(self, dataframe, columns, columns_mapping=None):
Expand Down Expand Up @@ -1046,10 +1052,13 @@ class InteractionsProvider(BaseProvider):
"""
Class for interactions requests.

Methods
-------
Properties
----------
interaction_types()
Get all available interaction types.

Methods
-------
all_interactions()
Get all available interaction fingerprints.
by_structure_klifs_id(structure_klifs_ids)
Expand Down Expand Up @@ -1428,3 +1437,62 @@ def _raise_invalid_extension(extension):
extensions = ["pdb", "mol2"]
if extension not in extensions:
raise ValueError(f"Invalid extension. Select from: {', '.join(extensions)}")


class DrugsProvider(BaseProvider):
    """
    Class for drugs requests.

    From the KLIFS Swagger API:
    https://dev.klifs.net/swagger_v2/#/Ligands/get_drug_list
    > The drug list endpoint returns a list of all annotated kinase ligands that are either
    > approved or are/have been in clinical trials.
    > This information is primarily powered by the PKIDB and complemented with KLIFS curation and
    > annotation + manually curated data from other sources (e.g. approved INNs).

    Methods
    -------
    all_drugs()
        Get all available drugs.

    Notes
    -----
    Class methods all return a pandas.DataFrame of drugs (rows) with the following attributes
    (columns):

    drug.inn : string
        International nonproprietary name.
    drug.brand_name : list of string
        Brand name(s).
    drug.synonym : list of string
        Synonym(s).
    drug.phase : string
        Current clinical phase of the drug.
    drug.approval_year : string
        Year of FDA approval.
        If the drug was approved by another institution, the institution is appended in
        parentheses, for example: "2017 (EMA)".
    drug.smiles : string
        SMILES string of the drug.
        TODO: "ligand.smiles" would be more consistent with the Ligand class; however, it is
        not guaranteed that the SMILES will be the same for the same ligand, thus
        "drug.smiles" is used.
    ligand.chembl_id : string
        Ligand ChEMBL ID.
    ligand.expo_id : string
        Ligand expo ID.
    """

    def all_drugs(self):
        """
        Get all available drugs.

        This base implementation is only a placeholder; subclasses must override it.

        Returns
        -------
        pandas.DataFrame
            Drugs (rows) with the columns as defined in the class docstring.

        Raises
        ------
        NotImplementedError
            Always, when called on this base class directly.
        ValueError
            If DataFrame is empty (documented contract for implementing subclasses; not
            raised by this base method itself).
        """
        message = "Implement in your subclass!"
        raise NotImplementedError(message)
15 changes: 15 additions & 0 deletions opencadd/databases/klifs/local.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
InteractionsProvider,
PocketsProvider,
CoordinatesProvider,
DrugsProvider,
)
from .schema import (
FIELDS,
Expand Down Expand Up @@ -879,3 +880,17 @@ def _add_residue_klifs_ids(self, dataframe, filepath):
dataframe = dataframe.merge(pocket_dataframe, on="residue.id", how="left")

return dataframe


class Drugs(LocalInitializer, DrugsProvider):
    """
    Extends DrugsProvider for local sessions.
    Information on drugs is not available locally, so requests raise an error.
    Refer to DrugsProvider documentation for more information:
    opencadd.databases.klifs.core.DrugsProvider
    """

    def all_drugs(self):
        """
        Placeholder for the drugs request in a local session.

        Raises
        ------
        NotImplementedError
            Always; drug information is not available locally.
        """
        message = "Information on drugs is not available locally! Please use a remote session."
        raise NotImplementedError(message)
21 changes: 21 additions & 0 deletions opencadd/databases/klifs/remote.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
InteractionsProvider,
PocketsProvider,
CoordinatesProvider,
DrugsProvider,
)
from .schema import FIELDS
from .utils import metadata_to_filepath, silence_logging
Expand Down Expand Up @@ -869,3 +870,23 @@ def _add_residue_klifs_ids(self, dataframe, structure_klifs_id):
dataframe = dataframe.astype({"residue.klifs_id": "Int64"})

return dataframe


class Drugs(RemoteInitializer, DrugsProvider):
    """
    Extends DrugsProvider to provide remote drug requests.
    Refer to DrugsProvider documentation for more information:
    opencadd.databases.klifs.core.DrugsProvider
    """

    def all_drugs(self):
        """
        Get all available drugs via the KLIFS API client.

        Returns
        -------
        pandas.DataFrame
            Drugs (rows), standardized with the "drugs" field definitions from FIELDS.
        """
        # Query the drug list endpoint through the KLIFS API client
        response = self._client.Ligands.get_drug_list().response()
        # Convert the returned list of ABC objects to a DataFrame
        dataframe = self._abc_to_dataframe(response.result)
        # Look up the "drugs" column types and the remote-to-opencadd column name mapping
        column_types = FIELDS.oc_name_to_type("drugs")
        name_mapping = FIELDS.remote_to_oc_names("drugs")
        # Standardize the DataFrame and hand it back
        return self._standardize_dataframe(dataframe, column_types, name_mapping)
8 changes: 8 additions & 0 deletions opencadd/databases/klifs/session.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@ class Session:
Interactions object for interaction requests.
coordinates : None or opencadd.databases.klifs.remote.Coordinates/local.Coordinates
Coordinates object for coordinates requests.
drugs : None or opencadd.databases.klifs.remote.Drugs
Drugs object for drugs requests.
"""

def __init__(self):
Expand All @@ -56,6 +58,7 @@ def __init__(self):
self.interactions = None
self.pockets = None
self.coordinates = None
self.drugs = None

@classmethod
def from_local(cls, path_to_klifs_download, path_to_klifs_metadata=None):
Expand Down Expand Up @@ -165,3 +168,8 @@ def _set_attributes(self, backend, path_to_klifs_download=None, database=None, c
database=database,
path_to_klifs_download=path_to_klifs_download,
)
self.drugs = backend.Drugs(
client=client,
database=database,
path_to_klifs_download=path_to_klifs_download,
)
2 changes: 1 addition & 1 deletion opencadd/databases/klifs/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ def metadata_to_filepath(
structure = f"{structure_pdb}{f'_alt{structure_alternate_model}' if bool(structure_alternate_model) else ''}{f'_chain{structure_chain}' if bool(structure_chain) else ''}"

# FIXME: The PDB download for ligands in KLIFS is named "klifs_ligand.pdb"
# instead of "ligand.expo_id". For the time being (until KLIFS maybe streamlines the file name
# instead of "ligand.pdb". For the time being (until KLIFS maybe streamlines the file name
# with all the other file names), rename the file here.
if entity == "ligand" and extension == "pdb":
entity = "klifs_ligand"
Expand Down
11 changes: 11 additions & 0 deletions opencadd/tests/databases/test_klifs_local_remote.py
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,17 @@ def test_all_bioactivities(self):

check_dataframe(result_remote, FIELDS.oc_name_to_type("bioactivities"))

def test_all_drugs(self):
    """
    Test request result for all drugs.

    The remote result is checked against the "drugs" field definitions; the local
    session is expected to raise NotImplementedError.
    """

    # Remote session: request all drugs and check the resulting DataFrame
    drugs_remote = REMOTE.drugs.all_drugs()
    expected_columns = FIELDS.oc_name_to_type("drugs")
    check_dataframe(drugs_remote, expected_columns)

    # Local session: the drugs request is expected to fail
    with pytest.raises(NotImplementedError):
        LOCAL.drugs.all_drugs()


class TestsFromKinaseIds:
"""
Expand Down