
Commit d32014c

Updated DrugMechDB parser to include all of the edges, instead of just drug targets
beaslejt committed Oct 17, 2023
1 parent 08aa891 commit d32014c
Showing 1 changed file with 71 additions and 83 deletions.
154 changes: 71 additions & 83 deletions parsers/drugmechdb/src/loadDrugMechDB.py
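In short, the parser now walks every link in each DrugMechDB indication path and writes it out as a KGX edge, rather than keeping only the drug-to-target pairs. A minimal sketch of that idea, assuming the graph/links layout visible in the diff below (the sample record and the extract_edges helper are illustrative, not part of the commit):

# Sketch: turn every link of a DrugMechDB indication path into a (subject, predicate, object) triple.
# Only the fields the parser below touches are shown; identifiers are made up.
sample_entry = {
    "graph": {"_id": "DB00001_MESH_D000001_1",
              "drug_mesh": "MESH:D000001",
              "disease_mesh": "MESH:D000002"},
    "links": [
        {"source": "MESH:D000001", "key": "decreases activity of", "target": "UniProt:P00734"},
        {"source": "UniProt:P00734", "key": "positively regulates", "target": "GO:0007596"},
    ],
}

def extract_edges(entry):
    """Yield one (subject, predicate, object) triple per link in the path."""
    for link in entry["links"]:
        predicate = "biolink:" + link["key"].replace(" ", "_")
        yield link["source"], predicate, link["target"]

for s, p, o in extract_edges(sample_entry):
    print(s, p, o)
# MESH:D000001 biolink:decreases_activity_of UniProt:P00734
# UniProt:P00734 biolink:positively_regulates GO:0007596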
@@ -1,23 +1,17 @@
import json
import requests as rq
import os
import pandas as pd

from Common.utils import GetData
from Common.loader_interface import SourceDataLoader
from Common.extractor import Extractor
from Common.kgxmodel import kgxnode, kgxedge

def load_json(json_data):
with open(json_data, encoding="utf-8") as file:
data = json.load(file)
file.close()
return data

# # Example usage
# json_file = 'indication_paths.json'
# csv_file = 'indication_paths.csv'
# data = load_json(json_file)

##############
# Class: Load in direct Gene/Protein-[biolink:target_for]->Disease relationships from DrugMechDB
# By: Jon-Michael Beasley
@@ -40,24 +34,22 @@ def __init__(self, test_mode: bool = False, source_data_dir: str = None):
"""
# call the super
super().__init__(test_mode=test_mode, source_data_dir=source_data_dir)
self.drugmechdb_version = '202307' # TODO temporarily hard coded
#self.drugmechdb_version = self.get_latest_source_version()
#self.drugmechdb_version = '202307' # TODO temporarily hard coded
self.drugmechdb_version = self.get_latest_source_version()
self.drugmechdb_data_url = f"https://github.com/SuLab/DrugMechDB/raw/main/"
self.drugmechdb_file_name = f"indication_paths.json"
self.data_files = [self.drugmechdb_file_name]

#TODO Write the function below to get latest update version from https://sulab.github.io/DrugMechDB/
def get_latest_source_version(self) -> str:
"""
gets the latest version of the data
:return:
"""
if self.drugmechdb_version:
return self.drugmechdb_version
### The method below gets the database version from the html, but this may be subject to change. ###
drugmechdb_download_page_response = rq.get('https://www.bindingdb.org/rwd/bind/chemsearch/marvin/Download.jsp')
version_index = drugmechdb_download_page_response.text.index('BindingDB_All_2D_') + 17
bindingdb_version = drugmechdb_download_page_response.text[version_index:version_index + 6]
drugmechdb_download_page_response = rq.get('https://github.com/SuLab/DrugMechDB')
version_index = drugmechdb_download_page_response.text.index('<span class="css-truncate css-truncate-target text-bold mr-2" style="max-width: none;') + 87
bindingdb_version = drugmechdb_download_page_response.text[version_index:version_index + 5]
print(bindingdb_version)

return f"{bindingdb_version}"
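The committed comment already flags the HTML scrape above as fragile: it keys off a GitHub CSS class string, and the result still lands in a variable named bindingdb_version. A sketch of a less brittle alternative, assuming SuLab/DrugMechDB publishes tagged releases, would be the GitHub releases API (not part of this commit):

import requests as rq

def get_latest_drugmechdb_release() -> str:
    """Sketch: fetch the latest release tag via the GitHub API instead of scraping page HTML."""
    response = rq.get("https://api.github.com/repos/SuLab/DrugMechDB/releases/latest",
                      headers={"Accept": "application/vnd.github+json"},
                      timeout=30)
    response.raise_for_status()
    return response.json()["tag_name"]  # exact tag format (e.g. "v2.0.1") is an assumption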

@@ -72,97 +64,93 @@ def get_data(self) -> int:
data_puller.pull_via_http(source_url, self.data_path)
i+=1
return True

def process_node_to_kgx(self,node_id):
#self.logger.info(f'processing node: {node_identity}')
node_id = node_id.replace("InterPro:","interpro:").replace("UniProt:","UniProtKB:").replace("taxonomy:","NCBITaxon:").replace("reactome:","REACT:").replace("DB:","DRUGBANK:").replace("Pfam:","PFAM:").replace("\ufeff","")
node_to_write = kgxnode(node_id)
self.output_file_writer.write_kgx_node(node_to_write)
return node_id
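The chained .replace() calls above rewrite DrugMechDB's identifier prefixes into the Biolink-preferred CURIE prefixes and strip a stray byte-order mark. The same mapping expressed as a table, a sketch rather than a drop-in replacement (it only rewrites a leading prefix, whereas the original replaces anywhere in the string):

# Prefix pairs taken directly from the .replace() chain above.
PREFIX_MAP = {
    "InterPro:": "interpro:",
    "UniProt:": "UniProtKB:",
    "taxonomy:": "NCBITaxon:",
    "reactome:": "REACT:",
    "DB:": "DRUGBANK:",
    "Pfam:": "PFAM:",
}

def normalize_curie(node_id: str) -> str:
    node_id = node_id.replace("\ufeff", "")  # drop a stray byte-order mark if present
    for old, new in PREFIX_MAP.items():
        if node_id.startswith(old):
            return new + node_id[len(old):]
    return node_id

assert normalize_curie("UniProt:P00734") == "UniProtKB:P00734"
assert normalize_curie("DB:DB00619") == "DRUGBANK:DB00619"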

def process_edge_to_kgx(self, subject_id: str, predicate: str, object_id: str, regulationType=None, complex_context=None):
if predicate:
if regulationType == None:
output_edge = kgxedge(
subject_id=subject_id,
object_id=object_id,
predicate=predicate,
primary_knowledge_source=self.provenance_id
)
else:
if regulationType == "positively":
direction = 'increased'
elif regulationType == "negatively":
direction = 'decreased'
output_edge = kgxedge(
subject_id=subject_id,
object_id=object_id,
predicate=predicate,
edgeprops={
'qualified_predicate':'biolink:causes',
'object_direction_qualifier':direction,
'object_aspect_qualifier':'expression',
},
primary_knowledge_source=self.provenance_id
)
self.output_file_writer.write_kgx_edge(output_edge)
else:
self.logger.warning(f'A predicate could not be mapped for relationship type {predicate}')
return
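For reference, the regulationType branch above maps "positively"/"negatively" onto a Biolink qualifier set (biolink:causes plus object direction and aspect qualifiers). A standalone sketch of that mapping; the property keys mirror the kgxedge call above, everything else is illustrative:

def regulation_edge_props(regulation_type):
    """Sketch of the qualifier mapping used in process_edge_to_kgx."""
    if regulation_type is None:
        return {}
    direction = {"positively": "increased", "negatively": "decreased"}[regulation_type]
    return {
        "qualified_predicate": "biolink:causes",
        "object_direction_qualifier": direction,
        "object_aspect_qualifier": "expression",
    }

assert regulation_edge_props("positively")["object_direction_qualifier"] == "increased"
assert regulation_edge_props(None) == {}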

def parse_data(self) -> dict:
"""
Parses the data file for graph nodes/edges
:return: ret_val: load_metadata
"""
triple_pair_dict = {
"dmdb_ids":[],
"drug_names":[],
"drug_meshs":[],
"drug_drugbanks":[],
"drug_target_names":[],
"drug_target_uniprots":[],
"disease_names":[],
"disease_meshs":[]
}

data = load_json(os.path.join(self.data_path,self.drugmechdb_file_name))
for entry in data:
dmdb_id = entry["graph"]["_id"]
drug_name = entry["graph"]["drug"]
#dmdb_id = entry["graph"]["_id"]
#drug_name = entry["graph"]["drug"]
drug_mesh = entry["graph"]["drug_mesh"]
drug_drugbank = entry["graph"]["drugbank"]
disease_name = entry["graph"]["disease"]
#drug_drugbank = entry["graph"]["drugbank"]
#disease_name = entry["graph"]["disease"]
disease_mesh = entry["graph"]["disease_mesh"]
links = entry["links"]

for i in range(len(links)):
triple = links[i]
if triple["source"] == drug_mesh:
source = triple["source"]
predicate = "biolink:" + triple["key"].replace(" ","_")
target = triple["target"]
source = triple["source"]
source_id = self.process_node_to_kgx(source)
predicate = "biolink:" + triple["key"].replace(" ","_")
target = triple["target"]
target_id = self.process_node_to_kgx(target)
self.process_edge_to_kgx(subject_id=source_id, predicate=predicate, object_id=target_id)

if source == drug_mesh:
nodes = entry["nodes"]
for node in nodes:
if (node["id"] == target) and (node["label"] == "Protein"):

drug_target_name = node["name"]
drug_target_uniprot = node["id"].replace('UniProt:', 'UniProtKB:')

triple_pair_dict["dmdb_ids"].append(dmdb_id)
triple_pair_dict["drug_names"].append(drug_name)
triple_pair_dict["drug_meshs"].append(drug_mesh)
triple_pair_dict["drug_drugbanks"].append(drug_drugbank)
triple_pair_dict["drug_target_names"].append(drug_target_name)
triple_pair_dict["drug_target_uniprots"].append(drug_target_uniprot)
triple_pair_dict["disease_names"].append(disease_name)
triple_pair_dict["disease_meshs"].append(disease_mesh)
#drug_target_name = node["name"]
drug_target_uniprot = node["id"]
drug_target_uniprot_id = self.process_node_to_kgx(drug_target_uniprot)
disease_mesh_id = self.process_node_to_kgx(disease_mesh)
self.process_edge_to_kgx(subject_id=drug_target_uniprot_id, predicate="biolink:target_for", object_id=disease_mesh_id)

# The section below checks the "Drug" + 1 node for drug metabolites, which may be the active molecule.
# Then, if the next node in the path is a protein, assign that as the target.
elif node["id"] == target and node["label"] in ["Drug","ChemicalSubstance"]:
if entry["links"][i+1]["source"] == node["id"]:
new_target = entry["links"][i+1]["target"]
for node in nodes:
if (node["id"] == new_target) and (node["label"] == "Protein"):
drug_target_name = node["name"]
drug_target_uniprot = node["id"].replace('UniProt:', 'UniProtKB:')

triple_pair_dict["dmdb_ids"].append(dmdb_id)
triple_pair_dict["drug_names"].append(drug_name)
triple_pair_dict["drug_meshs"].append(drug_mesh)
triple_pair_dict["drug_drugbanks"].append(drug_drugbank)
triple_pair_dict["drug_target_names"].append(drug_target_name)
triple_pair_dict["drug_target_uniprots"].append(drug_target_uniprot)
triple_pair_dict["disease_names"].append(disease_name)
triple_pair_dict["disease_meshs"].append(disease_mesh)
#drug_target_name = node["name"]
drug_target_uniprot = node["id"]
drug_target_uniprot_id = self.process_node_to_kgx(drug_target_uniprot)
disease_mesh_id = self.process_node_to_kgx(disease_mesh)
self.process_edge_to_kgx(subject_id=drug_target_uniprot_id, predicate="biolink:target_for", object_id=disease_mesh_id)

else:
continue
# print(len(triple_pair_dict["dmdb_ids"]))
# print(len(triple_pair_dict["drug_meshs"]))
# print(len(triple_pair_dict["drug_drugbanks"]))
# print(len(triple_pair_dict["drug_target_names"]))
# print(len(triple_pair_dict["drug_target_uniprots"]))
# print(len(triple_pair_dict["disease_meshs"]))
df = pd.DataFrame(triple_pair_dict)
print(len(df))
csv_file_name = os.path.join(self.data_path,"indication_paths.csv")
df.to_csv(csv_file_name)

#TODO Figure out how to parse the triple store as a dictionary
extractor = Extractor(file_writer=self.output_file_writer)
with open(csv_file_name, 'rt') as fp:
extractor.csv_extract(fp,
lambda line: line[6], # subject id
lambda line: line[8], # object id
lambda line: "biolink:target_for",
lambda line: {}, #Node 1 props
lambda line: {}, #Node 2 props
lambda line: {}, #Edge props
comment_character=None,
delim=",",
has_header_row=True
)
return extractor.load_metadata

return {}
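The trickiest piece of parse_data is the lookahead described in the comment above the Drug/ChemicalSubstance branch: when the first hop from the drug lands on a metabolite, the next link in the path is checked and, if it reaches a Protein node, that protein is used for the biolink:target_for shortcut edge. A self-contained sketch of that lookahead, with node and link shapes assumed from the parser and illustrative identifiers:

def find_protein_target(links, nodes, drug_mesh):
    """Sketch: return the Protein node reached from the drug, allowing one metabolite hop."""
    labels = {node["id"]: node["label"] for node in nodes}
    for i, link in enumerate(links):
        if link["source"] != drug_mesh:
            continue
        target = link["target"]
        if labels.get(target) == "Protein":
            return target
        # One-hop lookahead: drug -> metabolite (Drug/ChemicalSubstance) -> protein.
        if labels.get(target) in ("Drug", "ChemicalSubstance") and i + 1 < len(links):
            nxt = links[i + 1]
            if nxt["source"] == target and labels.get(nxt["target"]) == "Protein":
                return nxt["target"]
    return None

links = [{"source": "MESH:D000001", "key": "increases abundance of", "target": "MESH:C000002"},
         {"source": "MESH:C000002", "key": "decreases activity of", "target": "UniProt:P00734"}]
nodes = [{"id": "MESH:D000001", "label": "Drug", "name": "example drug"},
         {"id": "MESH:C000002", "label": "ChemicalSubstance", "name": "active metabolite"},
         {"id": "UniProt:P00734", "label": "Protein", "name": "prothrombin"}]
print(find_protein_target(links, nodes, "MESH:D000001"))  # UniProt:P00734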

