diff --git a/parsers/BINDING/src/loadBINDINGDB.py b/parsers/BINDING/src/loadBINDINGDB.py
index ae6be9cb..1349b168 100644
--- a/parsers/BINDING/src/loadBINDINGDB.py
+++ b/parsers/BINDING/src/loadBINDINGDB.py
@@ -1,17 +1,19 @@
 import os
 import enum
 import math
-import zipfile as z
+from zipfile import ZipFile as zipfile
 import requests as rq
 import pandas as pd
 from decimal import Decimal
-from numpy import nan
+import json
+from collections import defaultdict
 
 from parsers.BINDING.src.bindingdb_constraints import LOG_SCALE_AFFINITY_THRESHOLD #Change the binding affinity threshold here. Default is 10 uM Ki,Kd,EC50,orIC50
 from Common.utils import GetData
 from Common.loader_interface import SourceDataLoader
 from Common.extractor import Extractor
 from Common.node_types import PUBLICATIONS
+from Common.kgxmodel import kgxnode, kgxedge
 
 
 # Full Binding Data.
@@ -20,42 +22,36 @@ class BD_EDGEUMAN(enum.IntEnum):
     PUBCHEM_CID= 29
     UNIPROT_TARGET_CHAIN = 42
-    KI = 8
-    IC50 = 9
-    KD = 10
-    EC50 = 11
-    KON = 12
-    KOFF = 13
+    pKi = 8
+    pIC50 = 9
+    pKd = 10
+    pEC50 = 11
+    k_on = 12
+    k_off = 13
     PMID = 19
     PUBCHEM_AID = 20
-    PATENT_NUMEBR = 21
+    PATENT_NUMBER = 21
 
-def negative_log(score): ### This function converts nanomolar concentrations into log-scale units (pKi/pKd/pIC50/pEC50). ###
-    return -(math.log10(score*(10**-9)))
+def negative_log(concentration_nm): ### This function converts nanomolar concentrations into log-scale units (pKi/pKd/pIC50/pEC50). ###
+    return -(math.log10(concentration_nm*(10**-9)))
 
-def string_to_list(input, prefix: str):
-    #li = [prefix+item for item in input_list if not(math.isnan(item)) == True]
-    input = input.replace('[','').replace(']','')
-    li = input.split(", ")
-    li = [prefix+x for x in li if x != "nan"]
-    if li == None:
-        li = []
-    return li
+def generate_zipfile_rows(zip_file_path, file_inside_zip, delimiter='\\t'):
+    with zipfile(zip_file_path, 'r') as zip_file:
+        with zip_file.open(file_inside_zip, 'r') as file:
+            for line in file:
+                yield str(line).split(delimiter)
 
 def deduplicate_list(input_list):
-    return list(set(input_list))
+    deduplicated_list = []
+    seen_elements = set()
 
-def generate_dataframe_rows(df):
-    """Generator function to yield each row of a pandas DataFrame.
+    for item in input_list:
+        if item not in seen_elements:
+            deduplicated_list.append(item)
+            seen_elements.add(item)
 
-    Args:
-        df (pandas.DataFrame): The DataFrame to iterate over.
+    return deduplicated_list
 
-    Yields:
-        pandas.Series: One row from the DataFrame as a pandas Series.
- """ - for idx,row in df.iterrows(): - yield idx,row ############## # Class: Loading binding affinity measurements and sources from Binding-DB @@ -95,6 +91,7 @@ def __init__(self, test_mode: bool = False, source_data_dir: str = None): self.bindingdb_data_url = [f"https://www.bindingdb.org/bind/downloads/"] self.BD_archive_file_name = f"BindingDB_All_{self.bindingdb_version}.tsv.zip" + self.BD_file_name = f"BindingDB_All.tsv" self.data_files = [self.BD_archive_file_name] def get_latest_source_version(self) -> str: @@ -130,284 +127,76 @@ def parse_data(self) -> dict: :return: ret_val: load_metadata """ + data_store= dict() + + columns = [[x.value,x.name] for x in BD_EDGEUMAN if x.name not in ['PMID','PUBCHEM_AID','PATENT_NUMBER','PUBCHEM_CID','UNIPROT_TARGET_CHAIN']] + n = 0 + for row in generate_zipfile_rows(os.path.join(self.data_path,self.BD_archive_file_name), self.BD_file_name): + if n == 0: + n+=1 + continue + if self.test_mode: + if n == 1000: + break + if n%100000 == 0: + self.logger.info(f'processed {n} rows so far...') + ligand = row[BD_EDGEUMAN.PUBCHEM_CID.value] + protein = row[BD_EDGEUMAN.UNIPROT_TARGET_CHAIN.value] + if (ligand == '') or (protein == ''): # Check if Pubchem or UniProt ID is missing. + n+=1 + continue + ligand_protein_key = f"{ligand}~{protein}" + # The section below checks through all of the previous entry keys and uses + found_key = False + index = None + if ligand_protein_key in data_store: #TODO start here + entry = data_store[ligand_protein_key] + found_key = True + else: + entry = {} + entry.update({'ligand':f"PUBCHEM.COMPOUND:{ligand}"}) + entry.update({'protein':f"UniProtKB:{protein}"}) - extractor = Extractor(file_writer=self.output_file_writer) - - dtype_dict= {BD_EDGEUMAN.KI.value:str, - BD_EDGEUMAN.IC50.value:str, - BD_EDGEUMAN.KD.value:str, - BD_EDGEUMAN.EC50.value:str, - BD_EDGEUMAN.KON.value:str, - BD_EDGEUMAN.KOFF.value:str, - BD_EDGEUMAN.PMID.value:str, - BD_EDGEUMAN.PUBCHEM_AID.value:str, - BD_EDGEUMAN.PATENT_NUMEBR.value:str, - BD_EDGEUMAN.PUBCHEM_CID.value:str, - BD_EDGEUMAN.UNIPROT_TARGET_CHAIN.value:str} - - data_archive_path = os.path.join(self.data_path, self.BD_archive_file_name) - table = pd.read_csv(data_archive_path, - usecols=[ - BD_EDGEUMAN.KI.value, #From now on, it is position 0 - BD_EDGEUMAN.IC50.value, #From now on, it is position 1 - BD_EDGEUMAN.KD.value, #From now on, it is position 2 - BD_EDGEUMAN.EC50.value, #From now on, it is position 3 - BD_EDGEUMAN.KON.value, #From now on, it is position 4 - BD_EDGEUMAN.KOFF.value, #From now on, it is position 5 - BD_EDGEUMAN.PMID.value, #From now on, it is position 6 - BD_EDGEUMAN.PUBCHEM_AID.value, #From now on, it is position 7 - BD_EDGEUMAN.PATENT_NUMEBR.value, #From now on, it is position 8 - BD_EDGEUMAN.PUBCHEM_CID.value, #From now on, it is position 9 - BD_EDGEUMAN.UNIPROT_TARGET_CHAIN.value, #From now on, it is position 10 - ], - sep="\t", - compression='zip', - dtype=dtype_dict - ) - - table = table.dropna(subset=["UniProt (SwissProt) Primary ID of Target Chain"]) - table = table.dropna(subset=["PubChem CID"]) - self.logger.info(f'About to group and aggregate ligand-protein pair data. 
-        table = table.groupby(["PubChem CID","UniProt (SwissProt) Primary ID of Target Chain"]).agg(list).reset_index()
-        self.logger.info(f'Done grouping data!')
-        # only keep 1000 entries for test mode
-        if self.test_mode:
-            table = table.head(1000)
+            publications = [x for x in [f"pmid:{row[BD_EDGEUMAN.PMID.value]}",f"pubchem_aid:{row[BD_EDGEUMAN.PUBCHEM_AID.value]}",f"patent:{row[BD_EDGEUMAN.PATENT_NUMBER.value]}"] if x not in ['pmid:','pubchem_aid:','patent:']]
-        measurements_list = []
-        does_it_bind_list = []
-        publications_store = []
-        for idx,row in generate_dataframe_rows(table):
-            if idx%10000 == 0:
-                self.logger.info(f"Processed {idx} of {len(table)} rows so far.")
-            does_it_bind = False
-            measurements_dict = {}
-            pKi_measurements = []
-            pKd_measurements = []
-            pIC50_measurements = []
-            pEC50_measurements = []
-            kon_measurements = []
-            koff_measurements = []
-            publications_list = []
-            for i in range(len(row["Ki (nM)"])):
+            for column in columns:
-                if type(row["Ki (nM)"][i]) == str:
-                    measurement = {}
+                if row[column[0]] != '':
+                    measure_type = column[1]
+                    if measure_type not in entry.keys():
+                        entry.update({measure_type:[]})
                     try:
-                        value = round(negative_log(float(Decimal(str(row["Ki (nM)"][i]).replace('>','').replace('<','')))),2)
-                        if value > self.affinity_threshold:
-                            does_it_bind = True
-                    except:
+                        if measure_type in ["k_on", "k_off"]:
+                            value = round(float(row[column[0]].replace('>','').replace('<','').replace(' ','')),2)
+                        elif measure_type in ["pKi", "pKd", "pIC50", "pEC50"]:
+                            value = round(negative_log(float(row[column[0]].replace('>','').replace('<','').replace(' ',''))),2)
+                    except Exception as e:
+                        self.logger.info(f"Error:{e} on value: {row[column[0]]} {measure_type}")
                         value = "undefined"
-                    measurement.update({"affinity":value})
-                    if type(row["PMID"][i]) == str:
-                        pmid = "PMID:"+row["PMID"][i]
-                        if "publications" not in measurement.keys():
-                            measurement.update({"publications":[]})
-                        measurement["publications"].append(pmid)
-                        publications_list.append(pmid)
-                    if type(row["PubChem AID"][i]) == str:
-                        aid = "PUBCHEM.AID:"+row["PubChem AID"][i]
-                        if "publications" not in measurement.keys():
-                            measurement.update({"publications":[]})
-                        measurement["publications"].append(aid)
-                        publications_list.append(aid)
-                    if type(row["Patent Number"][i]) == str:
-                        patent = "PATENT:"+row["Patent Number"][i]
-                        if "publications" not in measurement.keys():
-                            measurement.update({"publications":[]})
-                        measurement["publications"].append(patent)
-                        publications_list.append(patent)
-                    pKi_measurements = pKi_measurements + [measurement]
-                if type(row["Kd (nM)"][i]) == str:
-                    measurement = {}
-                    try:
-                        value = round(negative_log(float(Decimal(str(row["Kd (nM)"][i]).replace('>','').replace('<','')))),2)
-                        if value > self.affinity_threshold:
-                            does_it_bind = True
-                    except:
-                        value = "undefined"
-                    measurement.update({"affinity":value})
-                    if type(row["PMID"][i]) == str:
-                        pmid = "PMID:"+row["PMID"][i]
-                        if "publications" not in measurement.keys():
-                            measurement.update({"publications":[]})
-                        measurement["publications"].append(pmid)
-                        publications_list.append(pmid)
-                    if type(row["PubChem AID"][i]) == str:
-                        aid = "PUBCHEM.AID:"+row["PubChem AID"][i]
-                        if "publications" not in measurement.keys():
-                            measurement.update({"publications":[]})
-                        measurement["publications"].append(aid)
-                        publications_list.append(aid)
-                    if type(row["Patent Number"][i]) == str:
-                        patent = "PATENT:"+row["Patent Number"][i]
-                        if "publications" not in measurement.keys():
-                            measurement.update({"publications":[]})
measurement["publications"].append(patent) - publications_list.append(patent) - pKd_measurements = pKd_measurements + [measurement] - if type(row["IC50 (nM)"][i]) == str: - measurement = {} - try: - value = round(negative_log(float(Decimal(str(row["IC50 (nM)"][i]).replace('>','').replace('<','')))),2) - if value > self.affinity_threshold: - does_it_bind = True - except: - value = "undefined" - measurement.update({"affinity":value}) - if type(row["PMID"][i]) == str: - pmid = "PMID:"+row["PMID"][i] - if "publications" not in measurement.keys(): - measurement.update({"publications":[]}) - measurement["publications"].append(pmid) - publications_list.append(pmid) - if type(row["PubChem AID"][i]) == str: - aid = "PUBCHEM.AID:"+row["PubChem AID"][i] - if "publications" not in measurement.keys(): - measurement.update({"publications":[]}) - measurement["publications"].append(aid) - publications_list.append(aid) - if type(row["Patent Number"][i]) == str: - patent = "PATENT:"+row["Patent Number"][i] - if "publications" not in measurement.keys(): - measurement.update({"publications":[]}) - measurement["publications"].append(patent) - publications_list.append(patent) - pIC50_measurements = pIC50_measurements + [measurement] + entry[measure_type].append({ + 'affinity':value, + 'publications':publications + }) - if type(row["EC50 (nM)"][i]) == str: - measurement = {} - try: - value = round(negative_log(float(Decimal(str(row["EC50 (nM)"][i]).replace('>','').replace('<','')))),2) - if value > self.affinity_threshold: - does_it_bind = True - except: - value = "undefined" - measurement.update({"affinity":value}) - if type(row["PMID"][i]) == str: - pmid = "PMID:"+row["PMID"][i] - if "publications" not in measurement.keys(): - measurement.update({"publications":[]}) - measurement["publications"].append(pmid) - publications_list.append(pmid) - if type(row["PubChem AID"][i]) == str: - aid = "PUBCHEM.AID:"+row["PubChem AID"][i] - if "publications" not in measurement.keys(): - measurement.update({"publications":[]}) - measurement["publications"].append(aid) - publications_list.append(aid) - if type(row["Patent Number"][i]) == str: - patent = "PATENT:"+row["Patent Number"][i] - if "publications" not in measurement.keys(): - measurement.update({"publications":[]}) - measurement["publications"].append(pmid) - publications_list.append(patent) - pEC50_measurements = pEC50_measurements + [measurement] + if "publications" not in entry.keys(): + entry.update({'publications':[]}) + entry['publications'] = deduplicate_list(entry['publications'] + publications) - if type(row["kon (M-1-s-1)"][i]) == str: - measurement = {} - try: - value = round(float(Decimal(str(row["kon (M-1-s-1)"][i]).replace('>','').replace('<',''))),2) - except: - value = "undefined" - measurement.update({"affinity":value}) - if type(row["PMID"][i]) == str: - pmid = "PMID:"+row["PMID"][i] - if "publications" not in measurement.keys(): - measurement.update({"publications":[]}) - measurement["publications"].append(pmid) - publications_list.append(pmid) - if type(row["PubChem AID"][i]) == str: - aid = "PUBCHEM.AID:"+row["PubChem AID"][i] - if "publications" not in measurement.keys(): - measurement.update({"publications":[]}) - measurement["publications"].append(aid) - publications_list.append(aid) - if type(row["Patent Number"][i]) == str: - patent = "PATENT:"+row["Patent Number"][i] - if "publications" not in measurement.keys(): - measurement.update({"publications":[]}) - measurement["publications"].append(patent) - publications_list.append(patent) - 
-                    kon_measurements = kon_measurements + [measurement]
-
-                if type(row["koff (s-1)"][i]) == str:
-                    measurement = {}
-                    try:
-                        value = round(float(Decimal(str(row["koff (s-1)"][i]).replace('>','').replace('<',''))),2)
-                    except:
-                        value = "undefined"
-                    measurement.update({"affinity":value})
-                    if type(row["PMID"][i]) == str:
-                        pmid = "PMID:"+row["PMID"][i]
-                        if "publications" not in measurement.keys():
-                            measurement.update({"publications":[]})
-                        measurement["publications"].append(pmid)
-                        publications_list.append(pmid)
-                    if type(row["PubChem AID"][i]) == str:
-                        aid = "PUBCHEM.AID:"+row["PubChem AID"][i]
-                        if "publications" not in measurement.keys():
-                            measurement.update({"publications":[]})
-                        measurement["publications"].append(aid)
-                        publications_list.append(aid)
-                    if type(row["Patent Number"][i]) == str:
-                        patent = "PATENT:"+row["Patent Number"][i]
-                        if "publications" not in measurement.keys():
-                            measurement.update({"publications":[]})
-                        measurement["publications"].append(patent)
-                        publications_list.append(patent)
-                    koff_measurements = koff_measurements + [measurement]
-
-            if pKi_measurements != []:
-                measurements_dict.update({"pKi":pKi_measurements})
-            if pKd_measurements != []:
-                measurements_dict.update({"pKd":pKd_measurements})
-            if pIC50_measurements != []:
-                measurements_dict.update({"pIC50":pIC50_measurements})
-            if pEC50_measurements != []:
-                measurements_dict.update({"pEC50":pEC50_measurements})
-            if kon_measurements != []:
-                measurements_dict.update({"kon(M-1-s-1)":pKi_measurements})
-            if koff_measurements != []:
-                measurements_dict.update({"koff(s-1)":pKi_measurements})
-            publications_store = publications_store + [deduplicate_list(publications_list)]
-            measurements_list = measurements_list + [measurements_dict]
-            if does_it_bind == True:
-                does_it_bind_list = does_it_bind_list + ["True"]
+            if found_key == True:
+                data_store[ligand_protein_key] = entry
             else:
-                does_it_bind_list = does_it_bind_list + ["False"]
-
-        table['measurements'] = measurements_list
-        table['does_it_bind'] = does_it_bind_list
-        table['publications'] = publications_store
-        table = table[table['does_it_bind'] == "True"]
-
-        pandas_output_file_path = os.path.join(self.data_path, f"BindingDB_temp_table.tsv")
-        table.to_csv(pandas_output_file_path, sep="\t",index=False)
-
-        # def does_it_bind_filter(infile):
-        #     yield next(infile)
-        #     for line in infile:
-        #         if(line.split('\t')[12])=="True": yield line
-
-        with open(pandas_output_file_path, 'r') as fp:
-            extractor.csv_extract(fp,
-                                  lambda line: f'PUBCHEM.COMPOUND:{line[0]}', # subject id
-                                  lambda line: f'UniProtKB:{line[1]}', # object id
-                                  lambda line: "biolink:binds",#self.KI_predicate if ((line[0] != '') and (negative_log(float(Decimal(line[0].replace('>','').replace('<','')))) > self.KI_score_threshold)) else None, # predicate
-                                  lambda line: {}, #Node 1 props
-                                  lambda line: {}, #Node 2 props
-                                  lambda line: {
-                                      "measurements":line[11],
-                                      PUBLICATIONS:line[13]
-                                  },
-                                  comment_character=None,
-                                  delim="\t",
-                                  has_header_row=True)
-
-        #os.remove(pandas_output_file_path)
+                data_store.update({ligand_protein_key:entry})
+            n+=1
+
+        extractor = Extractor(file_writer=self.output_file_writer)
+        extractor.json_extract(data_store,
+                               lambda item: data_store[item]['ligand'], # subject id
+                               lambda item: data_store[item]['protein'], # object id
+                               lambda item: "biolink:binds",
+                               lambda item: {}, #Node 1 props
+                               lambda item: {}, #Node 2 props
+                               lambda item: {key:value for key,value in data_store[item].items() if key not in ['ligand','protein']} #Edge props
+                               )
         return extractor.load_metadata
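
Illustrative sanity check of the two new helpers (a sketch for review, not part of the patch): negative_log is copied verbatim from above, and the byte-string note explains why generate_zipfile_rows defaults its delimiter to the two-character sequence '\\t' rather than a real tab.

    import math

    def negative_log(concentration_nm):  # copied from the patch above
        return -(math.log10(concentration_nm*(10**-9)))

    # 10,000 nM (10 uM) maps to 5.0 on the log scale, which lines up with the
    # LOG_SCALE_AFFINITY_THRESHOLD comment about a default 10 uM cutoff.
    assert round(negative_log(10000), 2) == 5.0
    assert round(negative_log(1), 2) == 9.0   # 1 nM -> pKi/pKd of 9.0

    # generate_zipfile_rows yields str(bytes_line).split(delimiter); str() on a
    # bytes object renders a tab as the two characters '\' and 't', so the rows
    # are recovered by splitting on that literal backslash-t sequence.
    assert str(b'a\tb') == "b'a\\tb'"
    assert str(b'a\tb').split('\\t') == ["b'a", "b'"]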