diff --git a/.github/scripts/Bio_QC_check.py b/.github/scripts/Bio_QC_check.py
new file mode 100644
index 00000000..4b6848ee
--- /dev/null
+++ b/.github/scripts/Bio_QC_check.py
@@ -0,0 +1,47 @@
+import os
+import requests
+
+PREDICATE_KEYWORDS = ["predicate", "biolink:", "edges"]
+LABEL_NAME = "Biological Context QC"  # Label to add if keywords are found
+
+# GitHub API variables
+GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")
+REPO_NAME = os.getenv("GITHUB_REPOSITORY")
+ISSUE_NUMBER = os.getenv("ISSUE_NUMBER")
+# Never log the token itself; report only whether it was provided.
+print("GITHUB_TOKEN set:", bool(GITHUB_TOKEN))
+print("REPO_NAME:", REPO_NAME)
+print("ISSUE_NUMBER:", ISSUE_NUMBER)
+
+headers = {"Authorization": f"Bearer {GITHUB_TOKEN}"}
+api_url = f"https://api.github.com/repos/{REPO_NAME}"
+
+def get_issue_details(issue_number):
+    response = requests.get(f"{api_url}/issues/{issue_number}", headers=headers)
+    response.raise_for_status()
+    return response.json()
+
+def add_label(issue_number, label_name):
+    response = requests.post(
+        f"{api_url}/issues/{issue_number}/labels",
+        headers=headers,
+        json={"labels": [label_name]}
+    )
+    response.raise_for_status()
+    print(f"Label '{label_name}' added to issue/PR #{issue_number}")
+
+def check_keywords_in_text(text, keywords):
+    return any(keyword in text for keyword in keywords)
+
+def main():
+    issue_details = get_issue_details(ISSUE_NUMBER)
+    title = issue_details["title"] or ""
+    body = issue_details["body"] or ""  # the API returns null for an empty body
+
+    if check_keywords_in_text(title, PREDICATE_KEYWORDS) or check_keywords_in_text(body, PREDICATE_KEYWORDS):
+        add_label(ISSUE_NUMBER, LABEL_NAME)
+    else:
+        print("No predicate keywords found.")
+
+if __name__ == "__main__":
+    main()
diff --git a/.github/workflows/label-predicate-changes.yml b/.github/workflows/label-predicate-changes.yml
new file mode 100644
index 00000000..fd663254
--- /dev/null
+++ b/.github/workflows/label-predicate-changes.yml
@@ -0,0 +1,32 @@
+name: 'Label Predicate Changes'
+
+on:
+  pull_request:
+    types: [opened, edited, synchronize]
+  issues:
+    types: [opened, edited]
+
+jobs:
+  label_check:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Check out code
+        uses: actions/checkout@v3
+
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: '3.9'
+
+      - name: Install dependencies
+        run: |
+          pip install -r requirements.txt
+          pip install requests
+
+      - name: Run predicate check
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          ISSUE_NUMBER: ${{ github.event.pull_request.number || github.event.issue.number }}
+        run: |
+          python .github/scripts/Bio_QC_check.py
\ No newline at end of file
diff --git a/Common/build_manager.py b/Common/build_manager.py
index 453cddf4..02cf2cee 100644
--- a/Common/build_manager.py
+++ b/Common/build_manager.py
@@ -21,10 +21,12 @@
 from Common.biolink_constants import PRIMARY_KNOWLEDGE_SOURCE, AGGREGATOR_KNOWLEDGE_SOURCES, PREDICATE, PUBLICATIONS
 from Common.meta_kg import MetaKnowledgeGraphBuilder, META_KG_FILENAME, TEST_DATA_FILENAME
 from Common.redundant_kg import generate_redundant_kg
+from Common.collapse_qualifiers import generate_collapsed_qualifiers_kg
 
 NODES_FILENAME = 'nodes.jsonl'
 EDGES_FILENAME = 'edges.jsonl'
 REDUNDANT_EDGES_FILENAME = 'redundant_edges.jsonl'
+COLLAPSED_QUALIFIERS_FILENAME = 'collapsed_qualifier_edges.jsonl'
 
 class GraphBuilder:
@@ -124,6 +126,49 @@ def build_graph(self, graph_spec: GraphSpec):
         output_formats = graph_spec.graph_output_format.lower().split('+') if graph_spec.graph_output_format else []
         nodes_filepath = os.path.join(graph_output_dir, NODES_FILENAME)
         edges_filepath = os.path.join(graph_output_dir,
EDGES_FILENAME) + + if 'redundant_jsonl' in output_formats: + self.logger.info(f'Generating redundant edge KG for {graph_id}...') + redundant_filepath = edges_filepath.replace(EDGES_FILENAME, REDUNDANT_EDGES_FILENAME) + generate_redundant_kg(edges_filepath, redundant_filepath) + + if 'redundant_neo4j' in output_formats: + self.logger.info(f'Generating redundant edge KG for {graph_id}...') + redundant_filepath = edges_filepath.replace(EDGES_FILENAME, REDUNDANT_EDGES_FILENAME) + generate_redundant_kg(edges_filepath, redundant_filepath) + self.logger.info(f'Starting Neo4j dump pipeline for redundant {graph_id}...') + dump_success = create_neo4j_dump(nodes_filepath=nodes_filepath, + edges_filepath=redundant_filepath, + output_directory=graph_output_dir, + graph_id=graph_id, + graph_version=graph_version, + logger=self.logger) + + if dump_success: + graph_output_url = self.get_graph_output_url(graph_id, graph_version) + graph_metadata.set_dump_url(f'{graph_output_url}graph_{graph_version}_redundant.db.dump') + + if 'collapsed_qualifiers_jsonl' in output_formats: + self.logger.info(f'Generating collapsed qualifier predicates KG for {graph_id}...') + collapsed_qualifiers_filepath = edges_filepath.replace(EDGES_FILENAME, COLLAPSED_QUALIFIERS_FILENAME) + generate_collapsed_qualifiers_kg(edges_filepath, collapsed_qualifiers_filepath) + + if 'collapsed_qualifiers_neo4j' in output_formats: + self.logger.info(f'Generating collapsed qualifier predicates KG for {graph_id}...') + collapsed_qualifiers_filepath = edges_filepath.replace(EDGES_FILENAME, COLLAPSED_QUALIFIERS_FILENAME) + generate_collapsed_qualifiers_kg(edges_filepath, collapsed_qualifiers_filepath) + self.logger.info(f'Starting Neo4j dump pipeline for {graph_id} with collapsed qualifiers...') + dump_success = create_neo4j_dump(nodes_filepath=nodes_filepath, + edges_filepath=collapsed_qualifiers_filepath, + output_directory=graph_output_dir, + graph_id=graph_id, + graph_version=graph_version, + logger=self.logger) + + if dump_success: + graph_output_url = self.get_graph_output_url(graph_id, graph_version) + graph_metadata.set_dump_url(f'{graph_output_url}graph_{graph_version}_collapsed_qualifiers.db.dump') + if 'neo4j' in output_formats: self.logger.info(f'Starting Neo4j dump pipeline for {graph_id}...') dump_success = create_neo4j_dump(nodes_filepath=nodes_filepath, @@ -137,11 +182,6 @@ def build_graph(self, graph_spec: GraphSpec): graph_output_url = self.get_graph_output_url(graph_id, graph_version) graph_metadata.set_dump_url(f'{graph_output_url}graph_{graph_version}.db.dump') - if 'redundant_jsonl' in output_formats: - self.logger.info(f'Generating redundant edge KG for {graph_id}...') - redundant_filepath = edges_filepath.replace(EDGES_FILENAME, REDUNDANT_EDGES_FILENAME) - generate_redundant_kg(edges_filepath, redundant_filepath) - return True # determine a graph version utilizing versions of data sources, or just return the graph version specified diff --git a/Common/collapse_qualifiers.py b/Common/collapse_qualifiers.py new file mode 100644 index 00000000..f3563a0a --- /dev/null +++ b/Common/collapse_qualifiers.py @@ -0,0 +1,171 @@ +try: + from tqdm import tqdm + TQDM_AVAILABLE = True +except ImportError: + TQDM_AVAILABLE = False + +from Common.biolink_constants import PREDICATE, QUALIFIED_PREDICATE, SUBJECT_DERIVATIVE_QUALIFIER, SUBJECT_FORM_OR_VARIANT_QUALIFIER, SUBJECT_PART_QUALIFIER, \ + SUBJECT_DIRECTION_QUALIFIER, SUBJECT_ASPECT_QUALIFIER, OBJECT_DERIVATIVE_QUALIFIER, OBJECT_FORM_OR_VARIANT_QUALIFIER, \ + OBJECT_PART_QUALIFIER, 
OBJECT_DIRECTION_QUALIFIER, OBJECT_ASPECT_QUALIFIER, CAUSAL_MECHANISM_QUALIFIER, \
+    ANATOMICAL_CONTEXT_QUALIFIER, SPECIES_CONTEXT_QUALIFIER
+from Common.biolink_utils import get_biolink_model_toolkit
+from Common.utils import quick_jsonl_file_iterator
+from Common.kgx_file_writer import KGXFileWriter
+
+### The goal of this script is to collapse the qualifiers, which are in edge properties, into a single statement, then replace the
+### existing predicate label with the collapsed qualifier statement.
+
+### Call the biolink model toolkit to get the list of all qualifiers. This may change, but the way qualifiers are handled is currently hard-coded in this script.
+bmt = get_biolink_model_toolkit()
+
+def write_edge_no_q(edge, predicate, qualifiers):
+    tmp_edge = edge.copy()
+    tmp_edge[PREDICATE] = predicate
+    for qualifier in qualifiers.keys():
+        tmp_edge.pop(qualifier, None)
+    return tmp_edge
+
+def aspect_qualifier_semantic_adjustment(aspect_qualifier):
+    # TODO check if other aspect qualifiers besides molecular interaction need to be treated differently.
+    if aspect_qualifier.split('_')[-1] == 'interaction':
+        aspect_conversion = aspect_qualifier + "_with"
+    else:
+        aspect_conversion = aspect_qualifier + "_of"
+    return aspect_conversion
+
+def form_or_variant_qualifier_semantic_adjustment(form_or_variant_qualifier):
+    # TODO check if other form_or_variant qualifiers need to be treated differently.
+    form_or_variant_conversion = form_or_variant_qualifier + "_of"
+    return form_or_variant_conversion
+
+def causal_mechanism_qualifier_semantic_adjustment(causal_mechanism_qualifier):
+    # TODO check if other causal_mechanism qualifiers need to be treated differently.
+    causal_mechanism_qualifier = "via_" + causal_mechanism_qualifier
+    return causal_mechanism_qualifier
+
+def species_context_qualifier_semantic_adjustment(species_context_qualifier):
+    species_context_qualifier = "in_" + species_context_qualifier
+    return species_context_qualifier
+
+def anatomical_context_qualifier_semantic_adjustment(anatomical_context_qualifier, species_context_qualifier=False):
+    if not species_context_qualifier:
+        anatomical_context_qualifier = "in_" + anatomical_context_qualifier
+    return anatomical_context_qualifier
+
+def generate_collapsed_qualifiers_kg(infile, edges_file_path):
+
+    with KGXFileWriter(edges_output_file_path=edges_file_path) as kgx_file_writer:
+        for edge in tqdm(quick_jsonl_file_iterator(infile)) if TQDM_AVAILABLE else quick_jsonl_file_iterator(infile):
+
+            try:
+                edge_predicate = edge['predicate']
+            except KeyError:
+                print(f"Collapsed Qualifiers Graph Failed - missing predicate on edge: {edge}")
+                break
+
+            qualifiers = {key: value for key, value in edge.items() if bmt.is_qualifier(key)}
+            # Count the number of qualifiers and print a warning if the number of qualifiers we handle in the next section doesn't match the number of qualifiers detected.
+            # This will help warn us if new qualifiers are added in the future while giving us the option to still run the script as is.
+            qualifier_count = len(qualifiers.keys())
+            counted_qualifiers = 0
+
+            # The following section crafts a new collapsed qualifier statement to replace the edge predicate, but needs to do some semantic adjustment.
+            # This is where to edit if the biolink model ever changes and handles qualifiers differently.
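+            # The statement is assembled in biolink reading order: subject qualifiers, the qualified predicate, object qualifiers, then mechanism and context qualifiers.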
+            # Take guidance from: https://biolink.github.io/biolink-model/reading-a-qualifier-based-statement/
+            # Example jsonl edge used here: {"subject":"UNII:7PK6VC94OU","predicate":"biolink:affects","object":"NCBIGene:6531","primary_knowledge_source":"infores:ctd","description":"decreases activity of","NCBITaxon":"9606","publications":["PMID:30776375"],"knowledge_level":"knowledge_assertion","agent_type":"manual_agent","subject_direction_qualifier":"increased","subject_aspect_qualifier":"abundance","subject_form_or_variant_qualifier":"mutant_form","subject_derivative_qualifier":"transcript","subject_part_qualifier":"polyA_tail","object_aspect_qualifier":"activity","object_direction_qualifier":"upregulated","object_form_or_variant_qualifier":"wildtype_form","object_derivative_qualifier":"protein","object_part_qualifier":"catalytic_site","causal_mechanism_qualifier":"phosphorylation","species_context_qualifier":"human","anatomical_context_qualifier":"liver","qualified_predicate":"biolink:causes"}
+
+            qualifier_statement = ""
+
+            # Add on subject direction and aspect qualifiers first. eg. "increased_abundance_of_"
+            if SUBJECT_DIRECTION_QUALIFIER in qualifiers.keys():
+                counted_qualifiers += 1
+                qualifier_statement += qualifiers[SUBJECT_DIRECTION_QUALIFIER]
+                qualifier_statement += "_"
+            if SUBJECT_ASPECT_QUALIFIER in qualifiers.keys():
+                counted_qualifiers += 1
+                qualifier_statement += aspect_qualifier_semantic_adjustment(qualifiers[SUBJECT_ASPECT_QUALIFIER])
+                qualifier_statement += "_"
+            # Add on subject form_or_variant qualifiers. eg. "increased_abundance_of_mutant_form_of_"
+            if SUBJECT_FORM_OR_VARIANT_QUALIFIER in qualifiers.keys():
+                counted_qualifiers += 1
+                qualifier_statement += form_or_variant_qualifier_semantic_adjustment(qualifiers[SUBJECT_FORM_OR_VARIANT_QUALIFIER])
+                qualifier_statement += "_"
+            # Add placeholder slot for subject node. eg. "increased_abundance_of_mutant_form_of__"
+            qualifier_statement += "_"
+            # Add on subject derivative and part qualifiers. eg. "increased_abundance_of_mutant_form_of__transcript_polyA_tail"
+            if SUBJECT_DERIVATIVE_QUALIFIER in qualifiers.keys():
+                counted_qualifiers += 1
+                qualifier_statement += qualifiers[SUBJECT_DERIVATIVE_QUALIFIER]
+                qualifier_statement += "_"
+            if SUBJECT_PART_QUALIFIER in qualifiers.keys():
+                counted_qualifiers += 1
+                qualifier_statement += qualifiers[SUBJECT_PART_QUALIFIER]
+                qualifier_statement += "_"
+
+            # Add the qualified predicate. eg. "increased_abundance_of_mutant_form_of__transcript_polyA_tail_causes"
+            if QUALIFIED_PREDICATE in qualifiers.keys():
+                counted_qualifiers += 1
+                qualifier_statement += qualifiers[QUALIFIED_PREDICATE].replace("biolink:", "")
+                qualifier_statement += "_"
+
+            # Add on object direction and aspect qualifiers. eg. "increased_abundance_of_mutant_form_of__transcript_polyA_tail_causes_upregulated_activity_of"
+            if OBJECT_DIRECTION_QUALIFIER in qualifiers.keys():
+                counted_qualifiers += 1
+                qualifier_statement += qualifiers[OBJECT_DIRECTION_QUALIFIER]
+                qualifier_statement += "_"
+            if OBJECT_ASPECT_QUALIFIER in qualifiers.keys():
+                counted_qualifiers += 1
+                qualifier_statement += aspect_qualifier_semantic_adjustment(qualifiers[OBJECT_ASPECT_QUALIFIER])
+                qualifier_statement += "_"
+            # Add on object form_or_variant qualifiers. eg. "increased_abundance_of_mutant_form_of__transcript_polyA_tail_causes_upregulated_activity_of_mutant_form_of"
+            if OBJECT_FORM_OR_VARIANT_QUALIFIER in qualifiers.keys():
+                counted_qualifiers += 1
+                qualifier_statement += form_or_variant_qualifier_semantic_adjustment(qualifiers[OBJECT_FORM_OR_VARIANT_QUALIFIER])
+                qualifier_statement += "_"
+            # Add placeholder slot for object node. eg. "increased_abundance_of_mutant_form_of__transcript_polyA_tail_causes_upregulated_activity_of_mutant_form_of_"
+            qualifier_statement += ""
+
+            # Add on object derivative and part qualifiers. eg. "increased_abundance_of_mutant_form_of__transcript_polyA_tail_causes_upregulated_activity_of_mutant_form_of__protein_catalytic_site"
+            # Need to start putting "_" before each qualifier as any given one could be the last in the statement.
+            if OBJECT_DERIVATIVE_QUALIFIER in qualifiers.keys():
+                counted_qualifiers += 1
+                qualifier_statement += "_"
+                qualifier_statement += qualifiers[OBJECT_DERIVATIVE_QUALIFIER]
+            if OBJECT_PART_QUALIFIER in qualifiers.keys():
+                counted_qualifiers += 1
+                qualifier_statement += "_"
+                qualifier_statement += qualifiers[OBJECT_PART_QUALIFIER]
+
+            # Add on mechanism qualifiers. eg. "..._protein_catalytic_site_via_phosphorylation"
+            if CAUSAL_MECHANISM_QUALIFIER in qualifiers.keys():
+                counted_qualifiers += 1
+                qualifier_statement += "_"
+                qualifier_statement += causal_mechanism_qualifier_semantic_adjustment(qualifiers[CAUSAL_MECHANISM_QUALIFIER])
+
+            # Add on species qualifiers. eg. "..._via_phosphorylation_in_human"
+            if SPECIES_CONTEXT_QUALIFIER in qualifiers.keys():
+                counted_qualifiers += 1
+                qualifier_statement += "_"
+                qualifier_statement += species_context_qualifier_semantic_adjustment(qualifiers[SPECIES_CONTEXT_QUALIFIER])
+
+            # Add on anatomical context qualifiers. eg. "..._via_phosphorylation_in_human_liver"
+            if ANATOMICAL_CONTEXT_QUALIFIER in qualifiers.keys():
+                counted_qualifiers += 1
+                qualifier_statement += "_"
+                species_qualifier = SPECIES_CONTEXT_QUALIFIER in qualifiers.keys()
+                qualifier_statement += anatomical_context_qualifier_semantic_adjustment(qualifiers[ANATOMICAL_CONTEXT_QUALIFIER], species_qualifier)
+
+            if counted_qualifiers < qualifier_count:
+                print(f"Qualifiers on edge: {edge} are not all being handled correctly. Please revise collapse_qualifiers.py to handle all qualifiers.")
+
+            # Either rewrite the original edge if no qualifier collapsing happened, or rewrite with the new predicate from qualifier_statement.
+            edges_to_write = []
+            # The subject placeholder makes qualifier_statement non-empty even for qualifier-free edges,
+            # so collapse only when qualifiers were actually found on the edge.
+            if qualifier_count > 0:
+                edges_to_write.append(write_edge_no_q(edge, qualifier_statement, qualifiers))
+            else:
+                edges_to_write.append(edge)
+
+            kgx_file_writer.write_normalized_edges(edges_to_write)
diff --git a/Common/data_sources.py b/Common/data_sources.py
index 082cf923..0aeb437e 100644
--- a/Common/data_sources.py
+++ b/Common/data_sources.py
@@ -19,6 +19,7 @@
 HMDB = 'HMDB'
 HUMAN_GOA = 'HumanGOA'
 INTACT = 'IntAct'
+LINCS = 'LINCS'
 LITCOIN = 'LitCoin'
 LITCOIN_SAPBERT = 'LitCoinSapBERT'
 LITCOIN_ENTITY_EXTRACTOR = 'LitCoinEntityExtractor'
@@ -33,6 +34,7 @@
 REACTOME = 'Reactome'
 SCENT = 'Scent'
 SGD = 'SGD'
+SIGNOR = 'SIGNOR'
 HUMAN_STRING = 'STRING-DB-Human'
 TEXT_MINING_KP = 'textminingkp'
 UBERGRAPH_NONREDUNDANT = 'UbergraphNonredundant'
@@ -67,6 +69,7 @@
     HUMAN_GOA: ("parsers.GOA.src.loadGOA", "HumanGOALoader"),
     HUMAN_STRING: ("parsers.STRING.src.loadSTRINGDB", "HumanSTRINGDBLoader"),
     INTACT: ("parsers.IntAct.src.loadIA", "IALoader"),
+    LINCS: ("parsers.LINCS.src.loadLINCS", "LINCSLoader"),
     LITCOIN: ("parsers.LitCoin.src.loadLitCoin", "LitCoinLoader"),
     LITCOIN_ENTITY_EXTRACTOR: ("parsers.LitCoin.src.loadLitCoin", "LitCoinEntityExtractorLoader"),
     LITCOIN_SAPBERT: ("parsers.LitCoin.src.loadLitCoin", "LitCoinSapBERTLoader"),
@@ -81,6 +84,7 @@
     REACTOME: ("parsers.Reactome.src.loadReactome", "ReactomeLoader"),
     SCENT: ("parsers.scent.src.loadScent", "ScentLoader"),
     SGD: ("parsers.SGD.src.loadSGD", "SGDLoader"),
+    SIGNOR: ("parsers.SIGNOR.src.loadSIGNOR", "SIGNORLoader"),
     TEXT_MINING_KP: ("parsers.textminingkp.src.loadTMKP", "TMKPLoader"),
     UBERGRAPH_NONREDUNDANT: ("parsers.UberGraph.src.loadUG", "UGLoader"),
     UBERGRAPH_REDUNDANT: ("parsers.UberGraph.src.loadUG", "UGRedundantLoader"),
diff --git a/Common/prefixes.py b/Common/prefixes.py
index c7018410..df2e2040 100644
--- a/Common/prefixes.py
+++ b/Common/prefixes.py
@@ -48,6 +48,7 @@
 ORPHANET='ORPHANET'
 PUBCHEM_COMPOUND='PUBCHEM.COMPOUND'
 PUBMED='PMID'
+RNACENTRAL='RNACENTRAL'
 UBERON='UBERON'
 UNIPROTKB='UniProtKB'
 UMLS='UMLS'
diff --git a/graph_specs/signor-graph-spec.yml b/graph_specs/signor-graph-spec.yml
new file mode 100644
index 00000000..280b4e54
--- /dev/null
+++ b/graph_specs/signor-graph-spec.yml
@@ -0,0 +1,9 @@
+graphs:
+
+  - graph_id: SIGNOR
+    graph_name: 'SIGNOR 3.0: The SIGnaling Network Open Resource'
+    graph_description: 'SIGNOR is a repository of manually annotated causal relationships between human proteins, chemicals of biological relevance, stimuli and phenotypes.'
+    graph_url: https://signor.uniroma2.it
+    output_format: neo4j
+    sources:
+      - source_id: SIGNOR
\ No newline at end of file
diff --git a/parsers/BINDING/src/loadBINDINGDB.py b/parsers/BINDING/src/loadBINDINGDB.py
index 9b2b0db1..042c240c 100644
--- a/parsers/BINDING/src/loadBINDINGDB.py
+++ b/parsers/BINDING/src/loadBINDINGDB.py
@@ -11,7 +11,8 @@
 from Common.utils import GetData, GetDataPullError
 from Common.loader_interface import SourceDataLoader
 from Common.extractor import Extractor
-from Common.biolink_constants import PUBLICATIONS, AFFINITY, AFFINITY_PARAMETER, KNOWLEDGE_LEVEL, AGENT_TYPE, KNOWLEDGE_ASSERTION, MANUAL_AGENT
+from Common.biolink_constants import PUBLICATIONS, AFFINITY, AFFINITY_PARAMETER, KNOWLEDGE_LEVEL, AGENT_TYPE, \
+    KNOWLEDGE_ASSERTION, MANUAL_AGENT
 
 # Full Binding Data.
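For reference, a minimal sketch (not part of the patch) of how the new collapsed-qualifier output is invoked; the file paths are hypothetical, and the expected predicate string is traced from the example edge in collapse_qualifiers.py:

    from Common.collapse_qualifiers import generate_collapsed_qualifiers_kg

    # Rewrites each edge with its qualifier properties removed and, when qualifiers were found,
    # its predicate replaced by the assembled statement.
    generate_collapsed_qualifiers_kg('edges.jsonl', 'collapsed_qualifier_edges.jsonl')

    # For the example edge, the replacement predicate works out to:
    # increased_abundance_of_mutant_form_of__transcript_polyA_tail_causes_upregulated_activity_of_wildtype_form_of__protein_catalytic_site_via_phosphorylation_in_human_liver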
diff --git a/parsers/CTD/src/loadCTD.py b/parsers/CTD/src/loadCTD.py
index 608f561d..9bd9e75c 100644
--- a/parsers/CTD/src/loadCTD.py
+++ b/parsers/CTD/src/loadCTD.py
@@ -533,7 +533,7 @@ def convert_predicates(predicate):
     :return:
     """
     # the capture regex
-    regex = '\/|\ |\^'
+    regex = r'\/|\ |\^'
 
     # clean up the predicate
     cleaned_predicate = re.sub(regex, '_', predicate)
diff --git a/parsers/GenomeAlliance/src/loadGenomeAlliance.py b/parsers/GenomeAlliance/src/loadGenomeAlliance.py
index 9027f519..059b86d6 100644
--- a/parsers/GenomeAlliance/src/loadGenomeAlliance.py
+++ b/parsers/GenomeAlliance/src/loadGenomeAlliance.py
@@ -2,6 +2,7 @@
 import os
 import enum
 import gzip
+import requests
 
 from Common.utils import GetData
 from Common.loader_interface import SourceDataLoader
@@ -36,8 +37,11 @@ def __init__(self, test_mode: bool = False, source_data_dir: str = None):
         self.latest_version = None
         self.latest_version = self.get_latest_source_version()
 
-        self.genome_alliance_url = f'https://download.alliancegenome.org/{self.get_latest_source_version()}/ORTHOLOGY-ALLIANCE/COMBINED/'
-        self.genome_alliance_ortholog_file = 'ORTHOLOGY-ALLIANCE_COMBINED_25.tsv.gz'
+        self.genome_alliance_url = 'https://fms.alliancegenome.org/download/'
+        self.genome_alliance_ortholog_file = 'ORTHOLOGY-ALLIANCE_COMBINED.tsv.gz'
         self.data_files = [self.genome_alliance_ortholog_file]
 
     def get_latest_source_version(self) -> str:
@@ -46,8 +50,11 @@ def get_latest_source_version(self) -> str:
 
         :return:
         """
-        if not self.latest_version:
-            self.latest_version = '5.3.0'
+        if not self.latest_version:
+            release_response = requests.get("https://www.alliancegenome.org/api/releaseInfo")
+            release_response.raise_for_status()
+            self.latest_version = release_response.json()['releaseVersion']
+
         return self.latest_version
 
     def get_data(self) -> int:
diff --git a/parsers/KinAce/src/loadKinAce.py b/parsers/KinAce/src/loadKinAce.py
index 62175f05..e66b373a 100644
--- a/parsers/KinAce/src/loadKinAce.py
+++ b/parsers/KinAce/src/loadKinAce.py
@@ -1,37 +1,43 @@
 import os
 import enum
-from zipfile import ZipFile as zipfile
-import pandas as pd
+import requests
 
 from Common.utils import GetData
 from Common.loader_interface import SourceDataLoader
 from Common.extractor import Extractor
-from Common.biolink_constants import PUBLICATIONS
+from Common.biolink_constants import *
 
-# Full Kinase-Substrate Phosphorylation Data.
-
-#make this reflect the column that the data is found in
-class BD_EDGEUMAN(enum.IntEnum):
-    KINASE = 1
-    SUBSTRATE = 2
-    P_SITE = 3
-    PRIMARY_SOURCE = 4
-    SECONDARY_SOURCE = 5
 
 ##############
 # Class: Loading kinase-substrate phosphorylation reactions from KinAce
 # By: Jon-Michael Beasley
 # Date: 03/7/2024
 ##############
+
+class DATACOLS(enum.IntEnum):
+    kinase = 0
+    substrate = 2
+    p_site = 4
+    primary_source = 5
+    PUBLICATIONS = 7
+
+
 class KinAceLoader(SourceDataLoader):
 
     source_id: str = 'KinAce'
     provenance_id: str = 'infores:kinace'
-    description = "The KinAce web portal aggregates and visualizes the network of interactions between protein-kinases and their substrates in the human genome."
+ description = ("The KinAce web portal aggregates and visualizes the network of interactions between " + "protein-kinases and their substrates in the human genome.") source_data_url = "https://kinace.kinametrix.com/session/ff792906de38db0d1c9900ac5882497b/download/download0?w=" - license = "All data and download files in bindingDB are freely available under a 'Creative Commons BY 3.0' license.'" + license = "Creative Commons Attribution 4.0 International" attribution = 'https://kinace.kinametrix.com/#section-about' - parsing_version = '1.0' + parsing_version = '1.2' + + KINACE_INFORES_MAPPING = { + 'PhosphoSitePlus': 'infores:psite-plus', + 'EPSD': 'infores:epsd', + 'iPTMNet': 'infores:iptmnet' + } def __init__(self, test_mode: bool = False, source_data_dir: str = None): """ @@ -41,12 +47,12 @@ def __init__(self, test_mode: bool = False, source_data_dir: str = None): # call the super super().__init__(test_mode=test_mode, source_data_dir=source_data_dir) - self.kinace_version = "2023-10-30" - #self.kinace_version = self.get_latest_source_version() - self.kinace_data_url = f"https://raw.githubusercontent.com/GauravPandeyLab/KinAce/master/data/{self.kinace_version}-kinace-dataset.zip" - - self.archive_file_name = f"{self.kinace_version}-kinace-dataset.zip" - self.interactions_file_name = f"ksi_source.csv" + # self.kinace_version = "2023-10-30" + # KinAce downloaded this data on the 30th of October 2023. However, they have made changes to the files since + # I suggest using the last commit date of the file to version this data set + self.kinace_data_url = f"https://github.com/GauravPandeyLab/KiNet/raw/master/data/ksi_source_full_dataset.csv" + # Let's use the full source for completeness rather than the pruned list + self.interactions_file_name = f"ksi_source_full_dataset.csv" self.data_files = [self.interactions_file_name] def get_latest_source_version(self) -> str: @@ -54,10 +60,14 @@ def get_latest_source_version(self) -> str: gets the latest version of the data :return: """ - if self.kinace_version: - return self.kinace_version - - return f"{self.kinace_version}" + url = (f"https://api.github.com/repos/GauravPandeyLab/KiNet/commits?" 
+ f"path=./data/{self.interactions_file_name}&per_page=1") + response = requests.get(url) + commits = response.json() + last_commit_date = commits[0]['commit']['committer']['date'] + date_version = last_commit_date[:10] + + return f"{date_version}" def get_data(self) -> int: """ @@ -67,10 +77,34 @@ def get_data(self) -> int: data_puller = GetData() source_url = f"{self.kinace_data_url}" data_puller.pull_via_http(source_url, self.data_path) - with zipfile(os.path.join(self.data_path, self.archive_file_name), 'r') as zip_ref: - zip_ref.extract(self.interactions_file_name, self.data_path) + return True + def get_pmids(self, line): + publication_list = [] + + if line[DATACOLS.PUBLICATIONS.value] in ['', 'NA']: + return publication_list + + ids = line[DATACOLS.PUBLICATIONS.value].split(';') + publication_list = ['PMID:' + i.strip() for i in ids if i.strip()] + + return publication_list + + def get_KL_AT_assignments(self, line): + knowledge_level = NOT_PROVIDED + agent_type = NOT_PROVIDED + if line[DATACOLS.primary_source.value] == 'PhosphoSitePlus': + knowledge_level = KNOWLEDGE_ASSERTION + agent_type = MANUAL_AGENT + elif line[DATACOLS.primary_source.value] == 'EPSD': + knowledge_level = NOT_PROVIDED + agent_type = NOT_PROVIDED + elif line[DATACOLS.primary_source.value] == 'iPTMNet': + knowledge_level = NOT_PROVIDED + agent_type = TEXT_MINING_AGENT + return [knowledge_level, agent_type] + def parse_data(self) -> dict: """ Parses the data file for graph nodes/edges @@ -78,35 +112,27 @@ def parse_data(self) -> dict: :return: ret_val: load_metadata """ - print('ok parsing') - # with zipfile(os.path.join(self.data_path, self.archive_file_name), 'r') as zip_ref: - # zip_ref.extract(self.interactions_file_name, self.data_path) - data = pd.read_csv(os.path.join(self.data_path, self.interactions_file_name)) - data = data.groupby(["Kinase", "Substrate"]).agg({"Site": list, "PrimarySource": list, "SecondarySource": list}).reset_index() - # Define a function to deduplicate lists - def deduplicate_list(lst): - lst = [x for x in lst if x == x] - return list(set(lst)) - # Apply deduplication function to each aggregated list - data['Site'] = data.apply(lambda row: list(set([x for x in row['Site'] if x==x])), axis=1) - data['PrimarySource'] = data.apply(lambda row: list(set([x for x in row['PrimarySource'] if x==x])), axis=1) - data['SecondarySource'] = data.apply(lambda row: list(set([x for x in row['SecondarySource'] if x==x])), axis=1) - data.to_csv(os.path.join(self.data_path, self.interactions_file_name)) + extractor = Extractor(file_writer=self.output_file_writer) - with open(os.path.join(self.data_path, self.interactions_file_name), 'rt') as fp: - extractor.csv_extract(fp, - lambda line: f"UniProtKB:{line[1]}", # subject id - lambda line: f"UniProtKB:{line[2]}", # object id - lambda line: "biolink:phosphorylates", # predicate - lambda line: {}, #Node 1 props - lambda line: {}, #Node 2 props - lambda line: { - 'phosphorylation_sites':line[3], - 'primary_sources':line[4], - 'secondary_sources':line[5] - }, #Edge props - comment_character=None, - delim=",", - has_header_row=True - ) - return extractor.load_metadata \ No newline at end of file + + with open(os.path.join(self.data_path, self.interactions_file_name)) as csvfile: + # change to csv reader + extractor.csv_extract(csvfile, + subject_extractor=lambda line: f"UniProtKB:{line[DATACOLS.kinase.value]}", + object_extractor=lambda line: f"UniProtKB:{line[DATACOLS.substrate.value]}", + predicate_extractor=lambda line: "biolink:affects", # predicate + 
+                                  edge_property_extractor=lambda line:
+                                  {QUALIFIED_PREDICATE: 'biolink:causes',
+                                   OBJECT_DIRECTION_QUALIFIER: 'increased',
+                                   OBJECT_ASPECT_QUALIFIER: 'phosphorylation',
+                                   'phosphorylation_sites': [line[DATACOLS.p_site.value]],
+                                   KNOWLEDGE_LEVEL: self.get_KL_AT_assignments(line)[0],
+                                   AGENT_TYPE: self.get_KL_AT_assignments(line)[1],
+                                   PRIMARY_KNOWLEDGE_SOURCE:
+                                       self.KINACE_INFORES_MAPPING.get(line[DATACOLS.primary_source.value]),
+                                   AGGREGATOR_KNOWLEDGE_SOURCES: [self.provenance_id],
+                                   PUBLICATIONS: self.get_pmids(line)},
+                                  has_header_row=True,
+                                  delim=',')
+        return extractor.load_metadata
diff --git a/parsers/LINCS/src/loadLINCS.py b/parsers/LINCS/src/loadLINCS.py
new file mode 100644
index 00000000..f71e16a7
--- /dev/null
+++ b/parsers/LINCS/src/loadLINCS.py
@@ -0,0 +1,95 @@
+import os
+import enum
+
+from Common.extractor import Extractor
+from Common.loader_interface import SourceDataLoader
+from Common.biolink_constants import *
+from Common.prefixes import PUBCHEM_COMPOUND
+from Common.utils import GetData
+
+
+class GENERICDATACOLS(enum.IntEnum):
+    SOURCE_ID = 2
+    SOURCE_LABEL = 3
+    TARGET_ID = 5
+    TARGET_LABEL = 6
+    PREDICATE = 7
+
+
+PREDICATE_MAPPING = {
+    "in_similarity_relationship_with": "biolink:chemically_similar_to",
+    "negatively_regulates": "RO:0002212",
+    "positively_regulates": "RO:0002213"
+}
+
+
+##############
+# Class: LINCS loader
+#
+# By: James Chung
+# Date: 10/30/2024
+# Desc: Class that loads/parses the data from the Library of Integrated Network-Based Cellular Signatures.
+#
+##############
+class LINCSLoader(SourceDataLoader):
+
+    source_id: str = 'LINCS'
+    provenance_id: str = 'infores:lincs'
+    parsing_version: str = '1.0'
+
+    def __init__(self, test_mode: bool = False, source_data_dir: str = None):
+        """
+        :param test_mode - sets the run into test mode
+        :param source_data_dir - the specific storage directory to save files in
+        """
+        super().__init__(test_mode=test_mode, source_data_dir=source_data_dir)
+
+        self.data_url = 'https://stars.renci.org/var/data_services/LINCS/'
+        self.edge_file = "LINCS.lookup.edges.csv"
+        self.data_files = [self.edge_file]
+
+    def get_latest_source_version(self) -> str:
+        # The KG was generated from the Data Distillery KG; no version was defined upstream.
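+        # A static version is pinned here; bump it manually if the upstream file is ever regenerated.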
+ latest_version = 'v1.0' + return latest_version + + def get_data(self) -> bool: + source_data_url = f'{self.data_url}{self.edge_file}' + data_puller = GetData() + data_puller.pull_via_http(source_data_url, self.data_path) + return True + + def parse_data(self) -> dict: + """ + Parses the data file for graph nodes/edges + + :return: ret_val: load_metadata + """ + extractor = Extractor(file_writer=self.output_file_writer) + lincs_file: str = os.path.join(self.data_path, self.edge_file) + with open(lincs_file, 'rt') as fp: + extractor.csv_extract(fp, + lambda line: self.resolve_id(line[GENERICDATACOLS.SOURCE_ID.value]), # source id + lambda line: self.resolve_id(line[GENERICDATACOLS.TARGET_ID.value]), # target id + lambda line: PREDICATE_MAPPING[line[GENERICDATACOLS.PREDICATE.value]], # predicate extractor + lambda line: {}, # subject properties + lambda line: {}, # object properties + lambda line: self.get_edge_properties(), # edge properties + comment_character='#', + delim=',', + has_header_row=True) + return extractor.load_metadata + + @staticmethod + def resolve_id(idstring: str): + if idstring.startswith("PUBCHEM"): + return idstring.replace("PUBCHEM", PUBCHEM_COMPOUND) + return idstring + + def get_edge_properties(self): + properties = { + PRIMARY_KNOWLEDGE_SOURCE: self.provenance_id, + KNOWLEDGE_LEVEL: KNOWLEDGE_ASSERTION, + AGENT_TYPE: DATA_PIPELINE + } + return properties diff --git a/parsers/PHAROS/src/loadPHAROS.py b/parsers/PHAROS/src/loadPHAROS.py index 6fc62bef..d1ddd298 100644 --- a/parsers/PHAROS/src/loadPHAROS.py +++ b/parsers/PHAROS/src/loadPHAROS.py @@ -1,7 +1,6 @@ import os import argparse import re -import requests from Common.loader_interface import SourceDataLoader, SourceDataBrokenError, SourceDataFailedError from Common.kgxmodel import kgxnode, kgxedge @@ -94,6 +93,7 @@ def __init__(self, test_mode: bool = False, source_data_dir: str = None): self.source_db = 'Target Central Resource Database' self.pharos_db = None self.genetic_association_predicate = 'WIKIDATA_PROPERTY:P2293' + self.target_for_predicate = "biolink:target_for" def get_latest_source_version(self) -> str: """ @@ -216,9 +216,13 @@ def parse_gene_to_disease(self) -> (int, int): self.output_file_writer.write_kgx_node(gene_node) if edge_provenance: + if edge_provenance == "infores:drugcentral": + assigned_predicate = self.target_for_predicate + else: + assigned_predicate = self.genetic_association_predicate gene_to_disease_edge = kgxedge(subject_id=gene_id, object_id=disease_id, - predicate=self.genetic_association_predicate, + predicate=assigned_predicate, edgeprops=edge_properties, primary_knowledge_source=edge_provenance, aggregator_knowledge_sources=[self.provenance_id]) @@ -381,11 +385,11 @@ def get_edge_props(self, result) -> (str, list, dict, str): # if there was affinity data save it affinity = result['affinity'] if affinity is not None and affinity != '': - props['affinity'] = float(affinity) + props[AFFINITY] = float(affinity) affinity_paramater = result['affinity_parameter'] if affinity_paramater: - props['affinity_parameter'] = f'p{result["affinity_parameter"]}' + props[AFFINITY_PARAMETER] = f'p{result["affinity_parameter"]}' # return to the caller return predicate, pmids, props, provenance diff --git a/parsers/SIGNOR/src/loadSIGNOR.py b/parsers/SIGNOR/src/loadSIGNOR.py new file mode 100644 index 00000000..ed615314 --- /dev/null +++ b/parsers/SIGNOR/src/loadSIGNOR.py @@ -0,0 +1,487 @@ + +import csv +import os +import re +import json +import enum +import requests as rq + +from 
requests_toolbelt.multipart.encoder import MultipartEncoder
+
+from Common.extractor import Extractor
+from Common.biolink_constants import *
+from Common.prefixes import *
+from Common.utils import GetData
+from Common.loader_interface import SourceDataLoader
+
+from parsers.SIGNOR.src.signor_mechanism_predicate_mapping import *
+
+
+class DATACOLS(enum.IntEnum):
+    """
+    An enumeration class representing column indices for data attributes in SIGNOR.
+
+    Each attribute corresponds to a specific column and the values
+    represent the index of that column.
+    """
+    SUBJECT = 0
+    SUBJECT_TYPE = 1
+    SUBJECT_ID = 2
+    SUBJECT_DATABASE = 3
+
+    OBJECT = 4
+    OBJECT_TYPE = 5
+    OBJECT_ID = 6
+    OBJECT_DATABASE = 7
+
+    EFFECT = 8
+    MECHANISM = 9
+
+    AA_MODIFIED = 10
+    SEQUENCE = 11
+
+    TAXON = 12
+    CELL_TYPE = 13
+    TISSUE_TYPE = 14
+
+    MODULAR_COMPLEX = 15  # unused
+    TARGET_COMPLEX = 16  # unused
+
+    MODIFICATION_A = 17  # unused
+    MODIFICATION_SEQUENCE_A = 18  # unused
+
+    MODIFICATION_B = 19  # unused
+    MODIFICATION_SEQUENCE_B = 20  # unused
+
+    PUBLICATIONS = 21
+    DESCRIPTION = 25
+
+
+class SIGNORLoader(SourceDataLoader):
+    """
+    Data loader class for SIGNOR
+    """
+    source_id: str = 'SIGNOR'
+    provenance_id: str = 'infores:signor'
+    description = ("Signor 3.0 is a resource that annotates experimental evidence about causal interactions between "
+                   "proteins and other entities of biological relevance: stimuli, phenotypes, enzyme inhibitors, "
+                   "complexes, protein families, etc.")
+    source_data_url = "https://signor.uniroma2.it/download_entity.php"
+    license = ("SIGNOR is licensed under a Creative Commons Attribution-NonCommercial 4.0 International "
+               "(CC BY-NC 4.0) license.")
+    attribution = 'https://signor.uniroma2.it/about/'
+    parsing_version = '1.2'
+
+    def __init__(self, test_mode: bool = False, source_data_dir: str = None):
+        """
+        constructor
+        :param test_mode - sets the run into test mode
+        """
+        # call the super
+        super().__init__(test_mode=test_mode, source_data_dir=source_data_dir)
+
+        self.signor_data_url = "https://signor.uniroma2.it/releases/getLatestRelease.php"
+        self.signor_mapping_download = "https://signor.uniroma2.it/download_complexes.php"
+        self.signor_cv_download = "https://signor.uniroma2.it/download_signor_def.php"
+        self.signor_pathways_download = "https://signor.uniroma2.it/scripts/GMTGenerator.php"
+
+        self.signor_phenotypes_filename = "SIGNOR-phenotype.csv"
+        self.signor_stimuli_filename = "SIGNOR-stimulus.csv"
+        self.signor_complex_filename = "SIGNOR-complex.csv"
+        self.signor_proteinfamily_filename = "SIGNOR-proteinfamily.csv"
+        self.signor_pathways_filename = "SIGNOR-pathways.tsv"
+        self.signor_mechanisms_filename = "SIGNOR-mechanisms.csv"
+        self.signor_modifications_filename = "SIGNOR-modifications.csv"
+        self.signor_data_file = "signor_data.json"
+
+        self.signor_version = self.get_latest_source_version()
+        self.signor_file_name = "getLatestRelease.php"
+        self.data_files = [self.signor_file_name,
+                           self.signor_phenotypes_filename,
+                           self.signor_stimuli_filename,
+                           self.signor_complex_filename,
+                           self.signor_proteinfamily_filename,
+                           self.signor_pathways_filename,
+                           self.signor_mechanisms_filename,
+                           self.signor_modifications_filename
+                           ]
+
+    def get_latest_source_version(self) -> str:
+        """
+        gets the latest version of the data
+        :return:
+        """
+
+        # The method below extracts the release version from the Content-Disposition header of the
+        # download response; the header format may be subject to change.
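+        # Expected header shape, inferred from the parsing below: attachment; filename="<version>_release.txt"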
+ signor_download_page_response = rq.post(self.signor_data_url) + file_name = signor_download_page_response.headers['Content-Disposition'] + file_name = file_name.replace("attachment; filename=", "").replace("_release.txt", + "").replace('"', '') + return file_name + + def get_data(self) -> int: + """ + Gets the SIGNOR 3.0 data. + Must send some complex data and headers, because it's php with a form with buttons on it, + which is why MultipartEncoder is used. + """ + + data_puller = GetData() + file_count = 0 + for source in self.data_files: + if source == self.signor_phenotypes_filename: + mp_encoder = MultipartEncoder(fields={"submit": (None, "Download phenotype data")}) + headers = {'Content-Type': mp_encoder.content_type} + response = rq.post(self.signor_mapping_download, headers=headers, data=mp_encoder) + with open(os.path.join(self.data_path, self.signor_phenotypes_filename), 'wb') as f: + f.write(response.content) + + elif source == self.signor_stimuli_filename: + mp_encoder = MultipartEncoder(fields={"submit": (None, "Download stimulus data")}) + headers = {'Content-Type': mp_encoder.content_type} + response = rq.post(self.signor_mapping_download, headers=headers, data=mp_encoder) + with open(os.path.join(self.data_path, self.signor_stimuli_filename), 'wb') as f: + f.write(response.content) + + elif source == self.signor_complex_filename: + mp_encoder = MultipartEncoder(fields={"submit": (None, "Download complex data")}) + headers = {'Content-Type': mp_encoder.content_type} + response = rq.post(self.signor_mapping_download, headers=headers, data=mp_encoder) + with open(os.path.join(self.data_path, self.signor_complex_filename), 'wb') as f: + f.write(response.content) + + elif source == self.signor_proteinfamily_filename: + mp_encoder = MultipartEncoder(fields={"submit": (None, "Download protein family data")}) + headers = {'Content-Type': mp_encoder.content_type} + response = rq.post(self.signor_mapping_download, headers=headers, data=mp_encoder) + with open(os.path.join(self.data_path, self.signor_proteinfamily_filename), 'wb') as f: + f.write(response.content) + + elif source == self.signor_mechanisms_filename: + mp_encoder = MultipartEncoder(fields={"submit": (None, "Download Mechansims CV")}) + # Mechanism is misspelled on the SIGNOR website. 
+                # If they fix their spelling, this will break.
+                headers = {'Content-Type': mp_encoder.content_type}
+                response = rq.post(self.signor_cv_download, headers=headers, data=mp_encoder)
+                with open(os.path.join(self.data_path, self.signor_mechanisms_filename), 'wb') as f:
+                    f.write(response.content)
+
+            elif source == self.signor_modifications_filename:
+                mp_encoder = MultipartEncoder(fields={"submit": (None, "Download Modifications CV")})
+                headers = {'Content-Type': mp_encoder.content_type}
+                response = rq.post(self.signor_cv_download, headers=headers, data=mp_encoder)
+                with open(os.path.join(self.data_path, self.signor_modifications_filename), 'wb') as f:
+                    f.write(response.content)
+
+            elif source == self.signor_pathways_filename:
+                mp_encoder = MultipartEncoder(fields={'format': 'include SIGNOR entities',
+                                                      'submit': 'Download GMT File (all Pathways)'
+                                                      })
+                headers = {'Content-Type': mp_encoder.content_type}
+                response = rq.post(self.signor_pathways_download, headers=headers, data=mp_encoder)
+                with open(os.path.join(self.data_path, self.signor_pathways_filename), 'wb') as f:
+                    f.write(response.content)
+
+            elif source == self.signor_file_name:
+                data_puller.pull_via_http(self.signor_data_url, self.data_path)
+
+            file_count += 1
+
+        self.make_datafile()
+
+        return file_count
+
+    def make_datafile(self):
+        """
+        This function makes the data file, a JSON file with all the SIGNOR data types laid out. This file can
+        then be used later to map SIGNOR-type entities to their respective information.
+        Also, this file can be modified to include additional information at a later date (i.e. mapped GO and HP terms).
+        """
+        signordata = {}
+
+        for file in self.data_files:
+            data_list = []
+            unique_rows = []
+
+            if file in [self.signor_phenotypes_filename, self.signor_stimuli_filename]:
+                section = os.path.splitext(file)[0].split("-")[1]
+
+                with open(os.path.join(self.data_path, file), 'r', newline='') as f:
+                    reader = csv.reader(f, delimiter=';', quotechar='"')
+                    next(reader)
+                    header = ["SIGNOR ID", "NAME", "DESCRIPTION"]  # next(reader)
+
+                    for row in reader:
+                        if len(row) != len(header):
+                            continue  # Skip malformed rows
+
+                        row_dict = {header[i]: row[i] for i in range(len(header))}
+
+                        # Check if GO term is mentioned in the description.
+                        # If so, add it to the dictionary
+                        go_term_match = re.search(r'GO:(\d{7})', row_dict.get("DESCRIPTION", ""))
+                        if go_term_match:
+                            go_term = f"GO:{go_term_match.group(1)}"
+                            row_dict['GO_TERM'] = go_term
+
+                        row_tuple = tuple((key, str(value)) for key, value in row_dict.items())
+
+                        # Check if the row is unique
+                        if row_tuple not in unique_rows:
+                            unique_rows.append(row_tuple)
+                            data_list.append(row_dict)
+
+                signordata[section] = data_list
+
+            elif file in [self.signor_modifications_filename, self.signor_mechanisms_filename]:
+                section = os.path.splitext(file)[0].split("-")[1]
+
+                with open(os.path.join(self.data_path, file), 'r', newline='') as f:
+                    reader = csv.reader(f, delimiter=';', quotechar='"')
+                    next(reader)
+                    header = ["NAME", "GO_TERM", "DEFINITION"]  # next(reader)
+
+                    for row in reader:
+                        if len(row) != len(header):
+                            continue  # Skip malformed rows
+                        row_dict = {header[i]: row[i] for i in range(len(header))}
+                        row_tuple = tuple((key, str(value)) for key, value in row_dict.items())
+
+                        # Check if the row is unique
+                        if row_tuple not in unique_rows:
+                            unique_rows.append(row_tuple)
+                            data_list.append(row_dict)
+
+                signordata[section] = data_list
+
+            elif file in [self.signor_complex_filename, self.signor_proteinfamily_filename]:
+                section = os.path.splitext(file)[0].split("-")[1]
+
+                with open(os.path.join(self.data_path, file), 'r', newline='') as f:
+                    reader = csv.reader(f, delimiter=';', quotechar='"')
+                    header = ["SIGNOR ID", "NAME", "ENTITIES"]  # next(reader)
+
+                    for row in reader:
+                        if len(row) != len(header):
+                            continue  # Skip malformed rows
+                        row_dict = {header[i]: row[i] for i in range(len(header))}
+                        row_tuple = tuple((key, str(value)) for key, value in row_dict.items())
+
+                        # Check if the row is unique
+                        if row_tuple not in unique_rows:
+                            unique_rows.append(row_tuple)
+                            entities = row_dict["ENTITIES"].split(", ")
+                            row_dict["ENTITIES"] = ["UniProtKB:" + uniprotid for uniprotid in entities]
+                            data_list.append(row_dict)
+
+                signordata[section] = data_list
+            else:
+                continue
+            # TODO: the pathways file is structured similarly to the modifications and mechanisms files, but
+            # instead of UniProt IDs it has protein names (i.e. BRAF vs P15056); is there a prefix to attach to these?
+            # Do we want these in the json data file? There are no pathways in the data release file.
+
+            # TODO: this file probably needs a lot of work (long-term work) to match the SIGNOR phenotypes,
+            # modifications and mechanisms to known CURIEs, i.e. HP or REACTOME. For now, it's usable.
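+        # Illustrative shape of the resulting JSON (section keys come from the CSV filenames):
+        # {"phenotype": [{"SIGNOR ID": ..., "NAME": ..., "DESCRIPTION": ..., "GO_TERM": ...}],
+        #  "stimulus": [...], "complex": [...], "proteinfamily": [...], "mechanisms": [...], "modifications": [...]}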
+        # Write the list of dictionaries to a JSON file for reference during edge creation
+        with open(os.path.join(self.data_path, self.signor_data_file), mode='w') as outfile:
+            json.dump(signordata, outfile, indent=4)
+
+    def node_data_mapping(self, line):
+
+        def get_node(database, identifier):
+            """adds the correct prefixes to the subject and objects"""
+            database_prefix_map = {
+                "UNIPROT": UNIPROTKB,
+                "PUBCHEM": PUBCHEM_COMPOUND,
+                "RNAcentral": RNACENTRAL,
+                "DRUGBANK": DRUGBANK
+            }
+
+            if database == "PUBCHEM":
+                # Remove prefix from PUBCHEM IDs in SIGNOR
+                return f"{database_prefix_map.get(database)}:{identifier.replace('CID:', '')}"
+
+            if database == "UNIPROT" and len(identifier.split("-PRO_")) > 1:
+                # Remove suffix from UNIPROT IDs in SIGNOR
+                # These suffixes are specific regions of the gene/protein and will be added to X_part_qualifier
+                return f"{database_prefix_map.get(database)}:{identifier.split('-PRO_')[0]}"
+
+            node = f"{database_prefix_map.get(database)}:{identifier}" if database in database_prefix_map else None
+            return node
+
+        def signor_node_mapping(entity_type, identifier):
+            """maps the SIGNOR ID to the GO_TERM if available."""
+            # NOTE: this reloads the JSON mapping file on every call; acceptable for now.
+            with open(os.path.join(self.data_path, self.signor_data_file), 'r') as file:
+                data = json.load(file)
+
+            # Search for the entry with the specified SIGNOR ID
+            for entry in data.get(entity_type, []):
+                if entry.get("SIGNOR ID") == identifier:
+                    go_term = entry.get("GO_TERM")
+                    if go_term:
+                        return go_term  # Return the GO_TERM if found
+
+        # Mapping for subject and object
+        subject_node = get_node(line[DATACOLS.SUBJECT_DATABASE.value], line[DATACOLS.SUBJECT_ID.value])
+        object_node = get_node(line[DATACOLS.OBJECT_DATABASE.value], line[DATACOLS.OBJECT_ID.value])
+
+        # For SIGNOR-internal IDs, map to a GO term when one is available
+        if line[DATACOLS.SUBJECT_DATABASE.value] == "SIGNOR":
+            subject_node = signor_node_mapping(line[DATACOLS.SUBJECT_TYPE.value], line[DATACOLS.SUBJECT_ID.value])
+
+        if line[DATACOLS.OBJECT_DATABASE.value] == "SIGNOR":
+            object_node = signor_node_mapping(line[DATACOLS.OBJECT_TYPE.value], line[DATACOLS.OBJECT_ID.value])
+
+        return subject_node, object_node
+
+    @staticmethod
+    def get_anatomical_context(line):
+        """
+        gets the cell and tissue types and puts them in one list for ANATOMICAL_CONTEXT_QUALIFIER
+        """
+        cell_types = line[DATACOLS.CELL_TYPE.value].split(";") if line[DATACOLS.CELL_TYPE.value] else []
+        tissue_types = line[DATACOLS.TISSUE_TYPE.value].split(";") if line[DATACOLS.TISSUE_TYPE.value] else []
+        return cell_types + tissue_types if cell_types or tissue_types else None
+
+    @staticmethod
+    def get_taxon(line):
+        """
+        gets the taxon id when available
+        returns None if taxon id is invalid
+        """
+        taxon_value = line[DATACOLS.TAXON.value]
+        return [f"NCBITaxon:{taxon_value}"] if taxon_value not in ["", "-1"] else None
+
+    @staticmethod
+    def get_part_qualifier(line):
+        """
+        gets the part qualifier from the suffix of UNIPROT IDs
+        """
+        def get_part(database, identifier):
+            if database == "UNIPROT" and len(identifier.split("-PRO_")) > 1:
+                return [identifier.split('-')[1]]
+
+        subject_part_qualifier = get_part(line[DATACOLS.SUBJECT_DATABASE.value], line[DATACOLS.SUBJECT_ID.value])
+        object_part_qualifier = get_part(line[DATACOLS.OBJECT_DATABASE.value], line[DATACOLS.OBJECT_ID.value])
+
+        return subject_part_qualifier, object_part_qualifier
+
+    @staticmethod
+    def edge_properties_from_mechanism(line, effect, predicate):
+        """
+        get the edge properties from the SIGNOR mechanisms/effects
+        """
+
+        # Handling post-translational modifications (PTMs)
+        if effect in ptm_dict:
effect == "cleavage": + effect = "degradation" + + predicate = "biolink:affects" + direction_qualifier = ptm_dict[effect] + + return predicate, { + QUALIFIED_PREDICATE: "RO:0003303", # causes + OBJECT_DIRECTION_QUALIFIER: direction_qualifier, + OBJECT_ASPECT_QUALIFIER: effect, + OBJECT_PART_QUALIFIER: [line[DATACOLS.AA_MODIFIED.value]] if line[DATACOLS.AA_MODIFIED.value] else None + } + + # other mechanisms + predicate = mechanism_map.get(effect, {}).get("predicate", predicate) + edge_properties = mechanism_map.get(effect, {}).get("edge_properties", {}) + + return predicate, edge_properties + + def get_basic_edge_properties(self, line): + """ + define basic edge properties for all edges + """ + edge_properties = { + PRIMARY_KNOWLEDGE_SOURCE: self.provenance_id, + KNOWLEDGE_LEVEL: KNOWLEDGE_ASSERTION, + AGENT_TYPE: MANUAL_AGENT, + PUBLICATIONS: ['PMID:' + line[DATACOLS.PUBLICATIONS.value]], + DESCRIPTION: [line[DATACOLS.DESCRIPTION.value]], + SPECIES_CONTEXT_QUALIFIER: self.get_taxon(line), + ANATOMICAL_CONTEXT_QUALIFIER: self.get_anatomical_context(line), + SUBJECT_PART_QUALIFIER: self.get_part_qualifier(line)[0] if self.get_part_qualifier(line)[0] else None, + OBJECT_PART_QUALIFIER: self.get_part_qualifier(line)[1] if self.get_part_qualifier(line)[1] else None + } + return edge_properties + + def create_and_parse_edge(self, row, extractor, predicate="biolink:related_to", + edge_properties=None, mechanism=None): + """ + Creates predicates and edge properties for a row + based on the effects and mechanisms in SIGNOR + """ + + # Default Edge Properties + basic_edge_properties = self.get_basic_edge_properties(row) + + if mechanism: + predicate, mechanism_edge_properties = self.edge_properties_from_mechanism(row, mechanism, predicate) + # Add mechanism specific edge properties to the basic edge properties + edge_properties = basic_edge_properties | mechanism_edge_properties + + if edge_properties: + # Add basic edge properties to effect specific edge properties + edge_properties.update(basic_edge_properties) + else: + edge_properties = basic_edge_properties + + extractor.parse_row( + row, + subject_extractor=lambda line: self.node_data_mapping(line)[0], + object_extractor=lambda line: self.node_data_mapping(line)[1], + predicate_extractor=lambda line: predicate, + subject_property_extractor=None, + object_property_extractor=None, + edge_property_extractor=lambda line: edge_properties + ) + + return predicate, edge_properties + + def parse_data(self) -> dict: + """ + Parses the data file for graph nodes/edges + :return: ret_val: load_metadata + """ + + extractor = Extractor(file_writer=self.output_file_writer) + + with open(os.path.join(self.data_path, self.signor_file_name)) as csvfile: + reader = csv.reader(csvfile, delimiter='\t', quotechar='"') + next(reader) + + for row in reader: + + effect = row[DATACOLS.EFFECT.value] + mechanism = row[DATACOLS.MECHANISM.value] + + if effect in effect_mapping.keys(): + # Handle edge from mechanism + if mechanism: + self.create_and_parse_edge(row, extractor, mechanism=mechanism) + + for predicate in effect_mapping[effect].keys(): + edge_properties = effect_mapping[effect][predicate] + + # Final edge creation + if mechanism: + # Handle edge from mechanism + self.create_and_parse_edge(row, extractor, predicate=predicate, + edge_properties=edge_properties, mechanism=mechanism) + else: + self.create_and_parse_edge(row, extractor, predicate=predicate, + edge_properties=edge_properties) + # Handle unknown effect case + elif effect == "unknown" and mechanism: + 
+                    self.create_and_parse_edge(row, extractor, mechanism=mechanism)
+
+                else:
+                    self.create_and_parse_edge(row, extractor)
+
+        return extractor.load_metadata
diff --git a/parsers/SIGNOR/src/signor_mechanism_predicate_mapping.py b/parsers/SIGNOR/src/signor_mechanism_predicate_mapping.py
new file mode 100644
index 00000000..bfe8286b
--- /dev/null
+++ b/parsers/SIGNOR/src/signor_mechanism_predicate_mapping.py
@@ -0,0 +1,223 @@
+from Common.biolink_constants import *
+
+ptm_dict = {
+    "acetylation": "increased",
+    "ADP-ribosylation": "increased",
+    "carboxylation": "increased",
+    "deacetylation": "decreased",
+    "degradation": "increased",  # cleavage
+    "deglycosylation": "decreased",
+    "demethylation": "decreased",
+    "dephosphorylation": "decreased",
+    "desumoylation": "decreased",
+    "deubiquitination": "decreased",
+    "glycosylation": "increased",
+    "hydroxylation": "increased",
+    "lipidation": "increased",
+    "methylation": "increased",
+    "monoubiquitination": "increased",
+    "neddylation": "increased",
+    "oxidation": "increased",
+    "palmitoylation": "increased",
+    "phosphorylation": "increased",
+    "polyubiquitination": "increased",
+    "s-nitrosylation": "increased",
+    "sumoylation": "increased",
+    "trimethylation": "increased",
+    "tyrosination": "increased",
+    "ubiquitination": "increased"
+}
+
+mechanism_map = {
+    "binding": {
+        "predicate": "RO:0002436",  # directly_physically_interacts_with
+    },
+
+    "catalytic activity": {
+        "predicate": "RO:0002327",  # catalyzes
+    },
+
+    "chemical activation": {
+        "edge_properties": {
+            CAUSAL_MECHANISM_QUALIFIER: "chemical activation"
+        }
+    },
+
+    "chemical inhibition": {
+        "edge_properties": {
+            CAUSAL_MECHANISM_QUALIFIER: "chemical inhibition"
+        }
+    },
+
+    "chemical modification": {
+        "predicate": "biolink:affects",
+        "edge_properties": {
+            QUALIFIED_PREDICATE: "RO:0003303",  # causes
+            OBJECT_ASPECT_QUALIFIER: "chemical modification",
+        }
+    },
+    "destabilization": {
+        "predicate": "biolink:affects",
+        "edge_properties": {
+            QUALIFIED_PREDICATE: "RO:0003303",
+            OBJECT_DIRECTION_QUALIFIER: "decreased",
+            OBJECT_ASPECT_QUALIFIER: "stability"
+        }
+    },
+
+    # This probably needs to be a node property. "is_a"
+    "gtpase - activating protein": {
+    },
+
+    # This probably needs to be a node property.
+    "guanine nucleotide exchange factor": {
+    },
+
+    "post transcriptional modification": {
+        "predicate": "biolink:affects",
+        "edge_properties": {
+            QUALIFIED_PREDICATE: "RO:0003303",
+            OBJECT_ASPECT_QUALIFIER: "post transcriptional modification"
+        }
+    },
+
+    "post translational modification": {
+        "predicate": "biolink:affects",
+        "edge_properties": {
+            QUALIFIED_PREDICATE: "RO:0003303",
+            OBJECT_ASPECT_QUALIFIER: "post translational modification"
+        }
+    },
+
+    # new predicate?
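+    # Mechanisms mapped to an empty dict (like the two above and "precursor of" below) fall through
+    # to the caller's incoming predicate with no extra edge properties.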
+ "precursor of": { + }, + + "relocalization": { + "predicate": f"biolink:affects", + "edge_properties": { + QUALIFIED_PREDICATE: "RO:0003303", + OBJECT_ASPECT_QUALIFIER: "relocation" + } + }, + + "small molecule catalysis": { + "predicate": "RO:0002327", # catalyses + }, + + "transcriptional regulation": { + "predicate": f"biolink:affects", + "edge_properties": { + QUALIFIED_PREDICATE: "RO:0003303", + CAUSAL_MECHANISM_QUALIFIER: "transcriptional_regulation" + } + }, + + "translation regulation": { + "predicate": f"biolink:affects", + "edge_properties": { + OBJECT_ASPECT_QUALIFIER: "translation" + } + }, + + } + +# Effect to Predicate Mapping +effect_mapping = { + "form complex": { + "biolink:in_complex_with": {}, + "RO:0002436": {} + }, + + "down-regulates": { + "RO:0002448": { + OBJECT_DIRECTION_QUALIFIER: "downregulates" + } + }, + + "down-regulates activity": { + "RO:0002448": { + OBJECT_DIRECTION_QUALIFIER: "downregulates", + OBJECT_ASPECT_QUALIFIER: "activity" + } + }, + + "down-regulates quantity": { + "RO:0002448": { + OBJECT_DIRECTION_QUALIFIER: "downregulates", + OBJECT_ASPECT_QUALIFIER: "abundance" + } + }, + + "down-regulates quantity by destabilization": { + "RO:0002448": { + OBJECT_DIRECTION_QUALIFIER: "downregulates", + OBJECT_ASPECT_QUALIFIER: "abundance" + }, + + "biolink:affects": { + QUALIFIED_PREDICATE: "RO:0003303", + OBJECT_DIRECTION_QUALIFIER: "decreased", + OBJECT_ASPECT_QUALIFIER: "stability" + } + }, + + "down-regulates quantity by repression": { + "RO:0002448": { + OBJECT_DIRECTION_QUALIFIER: "downregulates", + OBJECT_ASPECT_QUALIFIER: "abundance" + }, + + "biolink:affects": { + QUALIFIED_PREDICATE: "RO:0003303", + OBJECT_DIRECTION_QUALIFIER: "decreased", + OBJECT_ASPECT_QUALIFIER: "expression" + } + }, + + "up-regulates": { + "RO:0002448": { + OBJECT_DIRECTION_QUALIFIER: "upregulates" + } + }, + + "up-regulates activity": { + "RO:0002448": { + OBJECT_DIRECTION_QUALIFIER: "upregulates", + OBJECT_ASPECT_QUALIFIER: "activity" + } + }, + + "up-regulates quantity": { + "RO:0002448": { + OBJECT_DIRECTION_QUALIFIER: "upregulates", + OBJECT_ASPECT_QUALIFIER: "abundance" + } + }, + + "up-regulates quantity by stabilization": { + "RO:0002448": { + OBJECT_DIRECTION_QUALIFIER: "upregulates", + OBJECT_ASPECT_QUALIFIER: "abundance" + }, + + "biolink:affects": { + QUALIFIED_PREDICATE: "RO:0003303", + OBJECT_DIRECTION_QUALIFIER: "increased", + OBJECT_ASPECT_QUALIFIER: "stability" + } + }, + + "up-regulates quantity by expression": { + "RO:0002448": { + OBJECT_DIRECTION_QUALIFIER: "upregulates", + OBJECT_ASPECT_QUALIFIER: "abundance" + }, + + "biolink:affects": { + QUALIFIED_PREDICATE: "RO:0003303", + OBJECT_DIRECTION_QUALIFIER: "increased", + OBJECT_ASPECT_QUALIFIER: "expression" + } + } +} \ No newline at end of file diff --git a/parsers/SIGNOR/src/signor_predicate_mapping.py b/parsers/SIGNOR/src/signor_predicate_mapping.py new file mode 100644 index 00000000..1fcff96d --- /dev/null +++ b/parsers/SIGNOR/src/signor_predicate_mapping.py @@ -0,0 +1,80 @@ +PREDICATE_MAPPING = { + "down-regulates":{ + "predicate":"biolink:regulates", + "qualified_predicate":"", + "object_direction_qualifier":"downregulated", + "object_aspect_qualifier":"", + }, + "up-regulates quantity":{ + "predicate":"biolink:regulates", + "qualified_predicate":"biolink:causes", + "object_direction_qualifier":"increased", + "object_aspect_qualifier":"abundance", + }, + "down-regulates quantity by repression":{ + "predicate":"biolink:regulates", + "qualified_predicate":"biolink:causes", + 
"object_direction_qualifier":"decreased", + "object_aspect_qualifier":"expression", + }, + "up-regulates":{ + "predicate":"biolink:regulates", + "qualified_predicate":"", + "object_direction_qualifier":"upregulated", + "object_aspect_qualifier":"", + }, + "down-regulates quantity by destabilization":{ + "predicate":"biolink:regulates", + "qualified_predicate":"biolink:causes", + "object_direction_qualifier":"decreased", + "object_aspect_qualifier":"stability", + }, + "up-regulates quantity by expression":{ + "predicate":"biolink:regulates", + "qualified_predicate":"biolink:causes", + "object_direction_qualifier":"increased", + "object_aspect_qualifier":"expression", + }, + "down-regulates quantity":{ + "predicate":"biolink:regulates", + "qualified_predicate":"biolink:causes", + "object_direction_qualifier":"decreased", + "object_aspect_qualifier":"abundance", + }, + "up-regulates activity":{ + "predicate":"biolink:regulates", + "qualified_predicate":"biolink:causes", + "object_direction_qualifier":"increased", + "object_aspect_qualifier":"activity", + }, + "down-regulates activity":{ + "predicate":"biolink:regulates", + "qualified_predicate":"biolink:causes", + "object_direction_qualifier":"decreased", + "object_aspect_qualifier":"activity", + }, + "form complex":{ + "predicate":"biolink:in_complex_with", + "qualified_predicate":"", + "object_direction_qualifier":"", + "object_aspect_qualifier":"", + }, + "up-regulates quantity by stabilization":{ + "predicate":"biolink:regulates", + "qualified_predicate":"biolink:causes", + "object_direction_qualifier":"increased", + "object_aspect_qualifier":"stability", + }, + "down-regulates quantity by repression":{ + "predicate":"biolink:regulates", + "qualified_predicate":"biolink:causes", + "object_direction_qualifier":"decreased", + "object_aspect_qualifier":"expression", + }, + "unknown":{ + "predicate":"biolink:regulates", + "qualified_predicate":"", + "object_direction_qualifier":"", + "object_aspect_qualifier":"", + } +} \ No newline at end of file diff --git a/parsers/drugcentral/src/loaddrugcentral.py b/parsers/drugcentral/src/loaddrugcentral.py index bd828a85..50b56b8b 100644 --- a/parsers/drugcentral/src/loaddrugcentral.py +++ b/parsers/drugcentral/src/loaddrugcentral.py @@ -8,7 +8,7 @@ from Common.loader_interface import SourceDataLoader, SourceDataFailedError, SourceDataBrokenError from Common.utils import GetData, snakify from Common.biolink_constants import PRIMARY_KNOWLEDGE_SOURCE, AGGREGATOR_KNOWLEDGE_SOURCES, PUBLICATIONS, \ - KNOWLEDGE_LEVEL, KNOWLEDGE_ASSERTION, AGENT_TYPE, MANUAL_AGENT + KNOWLEDGE_LEVEL, KNOWLEDGE_ASSERTION, AGENT_TYPE, MANUAL_AGENT, AFFINITY, AFFINITY_PARAMETER from Common.prefixes import DRUGCENTRAL, MEDDRA, UMLS, UNIPROTKB, PUBMED from Common.predicates import DGIDB_PREDICATE_MAPPING from Common.db_connectors import PostgresConnector @@ -186,8 +186,8 @@ def get_bioactivity_attributes(self, line): edge_props = {KNOWLEDGE_LEVEL: KNOWLEDGE_ASSERTION, AGENT_TYPE: MANUAL_AGENT} if line['act_type'] is not None: - edge_props['affinity'] = line['act_value'] - edge_props['affinityParameter'] = line['act_type'] + edge_props[AFFINITY] = line['act_value'] + edge_props[AFFINITY_PARAMETER] = f"p{line['act_type']}" if line['act_source'] == 'SCIENTIFIC LITERATURE' and line['act_source_url'] is not None: papersource = line['act_source_url'] if papersource.startswith('http://www.ncbi.nlm.nih.gov/pubmed'): diff --git a/parsers/hgnc/src/loadHGNC.py b/parsers/hgnc/src/loadHGNC.py index 9eee0d56..09a0df57 100644 --- 
diff --git a/parsers/hgnc/src/loadHGNC.py b/parsers/hgnc/src/loadHGNC.py
index 9eee0d56..09a0df57 100644
--- a/parsers/hgnc/src/loadHGNC.py
+++ b/parsers/hgnc/src/loadHGNC.py
@@ -1,6 +1,7 @@
 import argparse
 import csv
 import os
+import requests
 
 from Common.utils import GetData
 from Common.loader_interface import SourceDataLoader
@@ -10,7 +11,7 @@
 
 ##############
-# Class: HGNC metabolites loader
+# Class: HGNC loader
 #
 # By: Phil Owen
 # Date: 3/31/2021
@@ -21,10 +22,10 @@ class HGNCLoader(SourceDataLoader):
     source_id: str = HGNC
     provenance_id: str = 'infores:hgnc'
     description = "The HUGO Gene Nomenclature Committee (HGNC) database provides open access to HGNC-approved unique symbols and names for human genes, gene groups, and associated resources, including links to genomic, proteomic and phenotypic information."
-    source_data_url = "ftp://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/archive/"
+    source_data_url = "https://www.genenames.org/download/archive/"
     license = "https://www.genenames.org/about/"
     attribution = "https://www.genenames.org/about/"
-    parsing_version: str = '1.2'
+    parsing_version: str = '1.3'
 
     def __init__(self, test_mode: bool = False, source_data_dir: str = None):
         """
@@ -32,17 +33,10 @@ def __init__(self, test_mode: bool = False, source_data_dir: str = None):
         :param source_data_dir - the specific storage directory to save files in
         """
         super().__init__(test_mode=test_mode, source_data_dir=source_data_dir)
-
+        self.source_db = 'HUGO Gene Nomenclature Committee'
         self.complete_set_file_name = 'hgnc_complete_set.txt'
-        # self.gene_groups_file_name = 'hgnc_genes_in_groups.txt'
-        self.data_files: list = [self.complete_set_file_name,
-                                 # self.gene_groups_file_name
-                                 ]
-        self.test_mode: bool = test_mode
-        self.source_db: str = 'HUGO Gene Nomenclature Committee'
-
-        self.ftp_site = 'ftp.ebi.ac.uk'
-        self.ftp_dir = '/pub/databases/genenames/hgnc/tsv/'
+        self.data_file = self.complete_set_file_name
+        self.data_url = "https://storage.googleapis.com/public-download-files/hgnc/tsv/tsv/"
 
     def get_latest_source_version(self) -> str:
         """
@@ -50,27 +44,25 @@ def get_latest_source_version(self) -> str:
         :return: the data version
         """
-        data_puller = GetData()
-        # HGNC files change very frequently, excluding the day makes sure we only update it once per month
-        data_file_date = data_puller.get_ftp_file_date(self.ftp_site,
-                                                       self.ftp_dir,
-                                                       self.data_files[0],
-                                                       exclude_day=True)
-        return data_file_date
+        headers = {"Accept": "application/json"}
+        info_response = requests.get('https://www.genenames.org/rest/info', headers=headers)
+        if info_response.ok:
+            info_json = info_response.json()
+            modified_date = info_json['lastModified']
+            latest_version = modified_date.split('T')[0]
+            return latest_version
+        else:
+            info_response.raise_for_status()
 
-    def get_data(self) -> int:
+    def get_data(self) -> bool:
         """
-        Gets the HGNC data from two sources.
+        Gets the HGNC complete set file via http.
         """
-        # get a reference to the data gathering class
-        gd: GetData = GetData(self.logger.level)
-        file_count: int = gd.pull_via_ftp(self.ftp_site, self.ftp_dir, [self.complete_set_file_name], self.data_path)
-
-        # get the gene groups dataset
-        # byte_count: int = gd.pull_via_http('https://www.genenames.org/cgi-bin/genegroup/download-all/' + self.self.gene_groups_file_name, self.data_path)
-
-        return file_count
+        gd: GetData = GetData()
+        data_file_url = self.data_url + self.data_file
+        gd.pull_via_http(url=data_file_url, data_dir=self.data_path)
+        return True
 
     def parse_data(self) -> dict:
         """
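The new HGNC version check relies on the genenames.org REST info endpoint, which returns JSON containing a lastModified ISO timestamp; splitting on 'T' reduces it to a date string that serves as the version. A standalone sketch, where the sample payload shape is an assumption inferred from the code above rather than a recorded response:

    import requests

    # Standalone version of the check above; the 'lastModified' value in the
    # comment is illustrative only.
    response = requests.get('https://www.genenames.org/rest/info',
                            headers={"Accept": "application/json"})
    response.raise_for_status()
    info_json = response.json()  # e.g. {'lastModified': '2024-08-23T09:05:00.000Z', ...}
    latest_version = info_json['lastModified'].split('T')[0]  # -> '2024-08-23'
    print(latest_version)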
""" - # get a reference to the data gathering class - gd: GetData = GetData(self.logger.level) - file_count: int = gd.pull_via_ftp(self.ftp_site, self.ftp_dir, [self.complete_set_file_name], self.data_path) - - # get the gene groups dataset - # byte_count: int = gd.pull_via_http('https://www.genenames.org/cgi-bin/genegroup/download-all/' + self.self.gene_groups_file_name, self.data_path) - - return file_count + gd: GetData = GetData() + data_file_url = self.data_url + self.data_file + gd.pull_via_http(url=data_file_url, data_dir=self.data_path) + return True def parse_data(self) -> dict: """ diff --git a/parsers/monarchkg/src/loadMonarchKG.py b/parsers/monarchkg/src/loadMonarchKG.py index 2417cb2f..73b4ee10 100644 --- a/parsers/monarchkg/src/loadMonarchKG.py +++ b/parsers/monarchkg/src/loadMonarchKG.py @@ -2,11 +2,12 @@ import os import tarfile import orjson +import requests from Common.loader_interface import SourceDataLoader from Common.kgxmodel import kgxedge from Common.biolink_constants import * -from Common.utils import GetData +from Common.utils import GetData, GetDataPullError ############## @@ -29,7 +30,7 @@ def __init__(self, test_mode: bool = False, source_data_dir: str = None): # there is a /latest/ for this url, but without a valid get_latest_source_version function, # it could create a mismatch, pin to this version for now - self.data_url = 'https://data.monarchinitiative.org/monarch-kg-dev/2024-03-18/' + self.data_url = 'https://data.monarchinitiative.org/monarch-kg-dev/latest/' self.monarch_graph_archive = 'monarch-kg.jsonl.tar.gz' self.monarch_edge_file_archive_path = 'monarch-kg_edges.jsonl' self.data_files = [self.monarch_graph_archive] @@ -63,9 +64,17 @@ def __init__(self, test_mode: bool = False, source_data_dir: str = None): } def get_latest_source_version(self) -> str: - # possible to retrieve from /latest/index.html with beautifulsoup or some html parser but not ideal, - # planning to try to set up a better method with owners - latest_version = '2024-03-18' + """ + Gets the name of latest monarch kg version from metadata. 
+ """ + latest_version = None + try: + metadata_yaml : requests.Response = requests.get("https://data.monarchinitiative.org/monarch-kg-dev/latest/metadata.yaml") + for line in metadata_yaml.text.split('\n'): + if("kg-version:" in line): latest_version = line.replace("kg-version:","").strip() + if(latest_version==None):raise ValueError("Cannot find 'kg-version' in Monarch KG metadata yaml.") + except Exception as e: + raise GetDataPullError(error_message=f'Unable to determine latest version for Monarch KG: {e}') return latest_version def get_data(self) -> bool: @@ -85,6 +94,10 @@ def parse_data(self) -> dict: skipped_ignore_knowledge_source = 0 skipped_undesired_predicate = 0 full_tar_path = os.path.join(self.data_path, self.monarch_graph_archive) + protected_edge_labels = [SUBJECT_ID, OBJECT_ID, PREDICATE,PRIMARY_KNOWLEDGE_SOURCE, + AGGREGATOR_KNOWLEDGE_SOURCES, KNOWLEDGE_LEVEL, AGENT_TYPE, + PUBLICATIONS, "biolink:primary_knowledge_source", "biolink:aggregator_knowledge_source"] + with tarfile.open(full_tar_path, 'r') as tar_files: with tar_files.extractfile(self.monarch_edge_file_archive_path) as edges_file: for line in edges_file: @@ -116,13 +129,14 @@ def parse_data(self) -> dict: KNOWLEDGE_LEVEL: monarch_edge[KNOWLEDGE_LEVEL] if KNOWLEDGE_LEVEL in monarch_edge else NOT_PROVIDED, AGENT_TYPE: monarch_edge[AGENT_TYPE] if AGENT_TYPE in monarch_edge else NOT_PROVIDED } + if monarch_edge[PUBLICATIONS]: edge_properties[PUBLICATIONS] = monarch_edge[PUBLICATIONS] + for edge_attribute in monarch_edge: - if '_qualifier' in edge_attribute and monarch_edge[edge_attribute]: + if edge_attribute not in protected_edge_labels and monarch_edge[edge_attribute]: edge_properties[edge_attribute] = monarch_edge[edge_attribute] - elif edge_attribute == QUALIFIED_PREDICATE and monarch_edge[QUALIFIED_PREDICATE]: - edge_properties[QUALIFIED_PREDICATE] = monarch_edge[QUALIFIED_PREDICATE] + output_edge = kgxedge( subject_id=subject_id, predicate=predicate, diff --git a/requirements.txt b/requirements.txt index 519dcb40..03a7ca86 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,6 @@ pandas==2.2.2 requests==2.32.3 +requests-toolbelt>=1.0.0 pytest==8.2.0 robokop-genetics==0.5.0 # intermine is on pypi but as of 6/23 it's broken for python 3.10+, this fork fixes the issue @@ -10,9 +11,9 @@ beautifulsoup4==4.12.3 psycopg2-binary==2.9.9 orjson==3.10.3 xxhash==3.4.1 -mysql-connector-python==8.4.0 +mysql-connector-python==9.1.0 neo4j==5.20.0 pyoxigraph==0.3.22 curies==0.7.9 prefixmaps==0.2.4 -bmt==1.4.1 +bmt==1.4.1 \ No newline at end of file