Merge pull request #188 from RobokopU24/parser_and_normalization_updates
Parser and normalization updates
beasleyjonm authored Oct 27, 2023
2 parents 6167ba6 + 2cd4ed7 commit 79a8241
Showing 16 changed files with 369 additions and 74 deletions.
8 changes: 5 additions & 3 deletions Common/data_sources.py
@@ -5,10 +5,10 @@
BIOLINK = 'Biolink'
CAM_KP = 'CAM-KP'
CHEBI_PROPERTIES = 'CHEBIProps'
REACTOME = 'Reactome'
CORD19 = 'Cord19'
CTD = 'CTD'
DRUG_CENTRAL = 'DrugCentral'
DRUGMECHDB = 'DrugMechDB'
# FOODB = 'FooDB' # this is on hold, data needs review after latest release of data.
GENOME_ALLIANCE_ORTHOLOGS = 'GenomeAllianceOrthologs'
GTEX = 'GTEx'
@@ -24,6 +24,7 @@
PANTHER = 'PANTHER'
PHAROS = 'PHAROS'
PLANT_GOA = 'PlantGOA'
REACTOME = 'Reactome'
SCENT = 'Scent'
SGD = 'SGD'
HUMAN_STRING = 'STRING-DB-Human'
@@ -39,17 +40,17 @@
YEAST_STRING = 'STRING-DB-Yeast'

RESOURCE_HOGS = [GTEX, GWAS_CATALOG, UNIREF, ONTOLOGICAL_HIERARCHY, UBERGRAPH_REDUNDANT,
SGD, HUMAN_STRING, CAM_KP]
SGD, HUMAN_STRING]

SOURCE_DATA_LOADER_CLASS_IMPORTS = {
BINDING_DB: ("parsers.BINDING.src.loadBINDINGDB", "BINDINGDBLoader"),
BIOLINK: ("parsers.biolink.src.loadBL", "BLLoader"),
CAM_KP: ("parsers.camkp.src.loadCAMKP", "CAMKPLoader"),
CHEBI_PROPERTIES: ("parsers.chebi.src.loadChebiProperties", "ChebiPropertiesLoader"),
REACTOME: ("parsers.Reactome.src.loadReactome", "ReactomeLoader"),
CORD19: ("parsers.cord19.src.loadCord19", "Cord19Loader"),
CTD: ("parsers.CTD.src.loadCTD", "CTDLoader"),
DRUG_CENTRAL: ("parsers.drugcentral.src.loaddrugcentral", "DrugCentralLoader"),
DRUGMECHDB: ("parsers.drugmechdb.src.loadDrugMechDB", "DrugMechDBLoader"),
GENOME_ALLIANCE_ORTHOLOGS: ("parsers.GenomeAlliance.src.loadGenomeAlliance", "GenomeAllianceOrthologLoader"),
GTEX: ("parsers.GTEx.src.loadGTEx", "GTExLoader"),
GTOPDB: ("parsers.gtopdb.src.loadGtoPdb", "GtoPdbLoader"),
@@ -65,6 +66,7 @@
PANTHER: ("parsers.panther.src.loadPanther", "PLoader"),
PHAROS: ("parsers.PHAROS.src.loadPHAROS", "PHAROSLoader"),
PLANT_GOA: ("parsers.GOA.src.loadGOA", "PlantGOALoader"),
REACTOME: ("parsers.Reactome.src.loadReactome", "ReactomeLoader"),
SCENT: ("parsers.scent.src.loadScent", "ScentLoader"),
SGD: ("parsers.SGD.src.loadSGD", "SGDLoader"),
TEXT_MINING_KP: ("parsers.textminingkp.src.loadTMKP", "TMKPLoader"),
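For context, SOURCE_DATA_LOADER_CLASS_IMPORTS pairs each source ID with a (module path, class name) tuple, so a loader is only imported when its source is actually processed. A minimal sketch of how such a mapping is typically consumed; the helper function below is hypothetical, not part of this commit:

import importlib

def get_loader_class(source_id: str):
    # Hypothetical helper (not from this commit): resolve a source id to its
    # loader class, importing the parser module lazily on first use.
    module_path, class_name = SOURCE_DATA_LOADER_CLASS_IMPORTS[source_id]
    module = importlib.import_module(module_path)
    return getattr(module, class_name)

# e.g. get_loader_class(REACTOME) would import parsers.Reactome.src.loadReactome
# and return its ReactomeLoader class.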
6 changes: 5 additions & 1 deletion Common/kgx_file_normalizer.py
@@ -144,7 +144,11 @@ def normalize_node_file(self):
regular_nodes_pre_norm += len(regular_nodes)
if regular_nodes:
self.logger.debug(f'Normalizing {len(regular_nodes)} regular nodes...')
self.node_normalizer.normalize_node_data(regular_nodes)
try:
self.node_normalizer.normalize_node_data(regular_nodes)
except Exception as e:
raise NormalizationFailedError(error_message='Error during node normalization.',
actual_error=e)
regular_nodes_post_norm += len(regular_nodes)
if regular_nodes:
self.logger.info(f'Normalized {regular_nodes_pre_norm} regular nodes so far...')
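The new try/except re-raises any failure during node normalization as a NormalizationFailedError while keeping the root cause. The constructor signature can be inferred from the call site above; the class body below is a sketch under that assumption, not the repository's actual definition:

class NormalizationFailedError(Exception):
    # Sketch only: signature inferred from the call site above; the real
    # class lives elsewhere in the repo and may differ.
    def __init__(self, error_message: str, actual_error: Exception = None):
        super().__init__(error_message)
        self.error_message = error_message
        self.actual_error = actual_error  # preserved root cause for logging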
31 changes: 25 additions & 6 deletions Common/meta_kg.py
@@ -26,8 +26,7 @@ def __init__(self,
nodes_file_path: str,
edges_file_path: str,
logger=None):
if logger:
self.logger = logger
self.logger = logger
self.bl_utils = BiolinkUtils()

self.node_id_to_leaf_types = None
@@ -62,7 +61,16 @@ def analyze_nodes(self, nodes_file_path: str):
for node in quick_jsonl_file_iterator(nodes_file_path):

# find the leaf node types of this node's types according to the biolink model, if not done already
node_types = frozenset(node[NODE_TYPES])
try:
node_types = frozenset(node[NODE_TYPES])
except TypeError as e:
error_message = f'Node types were not a valid list for node: {node}'
if self.logger:
self.logger.error(error_message)
else:
print(error_message)
node_types = frozenset()

if node_types not in node_types_to_leaves:
leaf_types = self.bl_utils.find_biolink_leaves(node_types)
node_types_to_leaves[node_types] = leaf_types
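Two things happen in this hunk: a malformed category value (e.g. None) now falls back to an empty frozenset instead of crashing, and the frozenset doubles as a dictionary key so find_biolink_leaves runs only once per distinct combination of node types. A simplified, self-contained illustration of that memoization pattern; the lookup function is a stand-in:

node_types_to_leaves = {}

def expensive_leaf_lookup(types: frozenset) -> frozenset:
    return types  # stand-in for self.bl_utils.find_biolink_leaves(types)

def leaf_types_for(node: dict) -> frozenset:
    try:
        node_types = frozenset(node['category'])
    except TypeError:  # category was None or otherwise not iterable
        node_types = frozenset()
    if node_types not in node_types_to_leaves:  # frozenset is hashable
        node_types_to_leaves[node_types] = expensive_leaf_lookup(node_types)
    return node_types_to_leaves[node_types]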
@@ -108,9 +116,12 @@ def analyze_edges(self, edges_file_path: str):
subject_types = node_id_to_leaf_types[edge[SUBJECT_ID]]
object_types = node_id_to_leaf_types[edge[OBJECT_ID]]
except KeyError as e:
error_message = f'Leaf node types not found for node: {e}. '\
f'Make sure the node is present in the nodes file.'
if self.logger:
self.logger.error(f'Leaf node types not found for a node on edge {json.dumps(edge)}. '
f'Make sure the nodes are present in the nodes file. KeyError: {e}')
self.logger.error(error_message)
else:
print(error_message)
continue

non_core_edge_attributes = [key for key in edge.keys()
@@ -136,7 +147,15 @@
edge_type_key = f'{subject_type}{object_type}{predicate}'
edge_type_key_to_attributes[edge_type_key].update(edge_attributes)
for qual, qual_val in edge_qualifier_values.items():
edge_type_key_to_qualifiers[edge_type_key][qual].add(qual_val)
try:
edge_type_key_to_qualifiers[edge_type_key][qual].add(qual_val)
except TypeError as e:
error_message = f'Type of value for qualifier not expected: {qual}: {qual_val}, '\
f'ignoring for meta kg. Error: {e}'
if self.logger:
self.logger.warning(error_message)
else:
print(error_message)

if edge_type_key not in edge_type_key_to_example:
example_edge = {
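The qualifier guard added above exists because qualifier values are accumulated in sets, and set.add raises TypeError for unhashable values such as lists. A small demonstration, assuming a nested defaultdict-of-sets, which matches the access pattern in the diff; the key is illustrative only:

from collections import defaultdict

edge_type_key_to_qualifiers = defaultdict(lambda: defaultdict(set))
key = 'biolink:Genebiolink:Diseasebiolink:affects'  # illustrative subject+object+predicate key

edge_type_key_to_qualifiers[key]['object_aspect_qualifier'].add('activity')      # fine
try:
    edge_type_key_to_qualifiers[key]['object_aspect_qualifier'].add(['activity'])  # a list
except TypeError as e:
    print(f'Type of value for qualifier not expected, ignoring for meta kg. Error: {e}')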
1 change: 1 addition & 0 deletions Common/node_types.py
@@ -67,6 +67,7 @@
NODE_TYPES = 'category'
SYNONYMS = 'equivalent_identifiers'
INFORMATION_CONTENT = 'information_content'
DESCRIPTION = 'description'

FALLBACK_EDGE_PREDICATE = 'biolink:related_to'

21 changes: 14 additions & 7 deletions Common/normalization.py
@@ -114,7 +114,8 @@ def normalize_node_data(self, node_list: list, block_size: int = 1000) -> list:
# get the data
resp: requests.models.Response = requests.post(f'{self.node_norm_endpoint}get_normalized_nodes',
json={'curies': data_chunk,
'conflate': self.conflate_node_types})
'conflate': self.conflate_node_types,
'description': True})

# did we get a good status code
if resp.status_code == 200:
@@ -198,17 +199,23 @@ def normalize_node_data(self, node_list: list, block_size: int = 1000) -> list:
current_node_normalization = cached_node_norms[current_node_id]
if current_node_normalization is not None:

current_node_id_section = current_node_normalization['id']

# update the node with the normalized info
normalized_id = current_node_normalization['id']['identifier']
normalized_id = current_node_id_section['identifier']
current_node['id'] = normalized_id
current_node[NODE_TYPES] = current_node_normalization['type']
current_node[SYNONYMS] = list(item['identifier'] for item in current_node_normalization[SYNONYMS])
if INFORMATION_CONTENT in current_node_normalization:
current_node[INFORMATION_CONTENT] = current_node_normalization[INFORMATION_CONTENT]

# set the name as the label if it exists
if 'label' in current_node_normalization['id']:
current_node['name'] = current_node_normalization['id']['label']
# set the name as the primary label if it exists
if 'label' in current_node_id_section:
current_node['name'] = current_node_id_section['label']

# set the node description and/or information content if they are present
if 'information_content' in current_node_normalization:
current_node[INFORMATION_CONTENT] = current_node_normalization[INFORMATION_CONTENT]
if 'description' in current_node_id_section:
current_node[DESCRIPTION] = current_node_id_section['description']

self.node_normalization_lookup[current_node_id] = [normalized_id]
else:
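The request now asks the node normalizer for descriptions as well, and the parsing reads the label and description off the 'id' section of each result. A sketch of the call and the response shape this code expects; the shape is inferred from the diff and the example CURIE is hypothetical:

import requests

resp = requests.post('https://nodenormalization-sri.renci.org/get_normalized_nodes',
                     json={'curies': ['MONDO:0005148'],   # hypothetical example CURIE
                           'conflate': True,
                           'description': True})          # new in this commit
result = resp.json()['MONDO:0005148']
id_section = result['id']    # {'identifier': ..., 'label': ..., 'description': ...}
normalized_id = id_section['identifier']
name = id_section.get('label')                # primary label, when present
description = id_section.get('description')   # only present when available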
8 changes: 1 addition & 7 deletions graph_specs/default-graph-spec.yml
@@ -56,9 +56,6 @@ graphs:
sources:
# using an older version of GWASCatalog because EFO terms are not normalizing right now
- source_id: GWASCatalog
source_version: '8_23_2022'
edge_normalization_version: 'v.2.4.4'
node_normalization_version: '2.0.9'
- source_id: GTEx

- graph_id: BINDING_Automat
@@ -157,9 +154,6 @@
output_format: neo4j
sources:
- source_id: GWASCatalog
source_version: '8_23_2022'
edge_normalization_version: 'v.2.4.4'
node_normalization_version: '2.0.9'
- source_id: OntologicalHierarchy
merge_strategy: connected_edge_subset

@@ -249,7 +243,7 @@
graph_url: https://string-db.org/
output_format: neo4j
sources:
- source_id: STRING-DB
- source_id: STRING-DB-Human
- source_id: OntologicalHierarchy
merge_strategy: connected_edge_subset

4 changes: 2 additions & 2 deletions helm/data-services/renci-values.yaml
@@ -13,7 +13,7 @@ dataServices:
image:
repository: ghcr.io/robokopu24/orion
pullPolicy: IfNotPresent
tag: v1.0.7-2
tag: v1.0.9
graphsVolume:
use_nfs: true
nfs_server: na-projects.edc.renci.org
@@ -46,7 +46,7 @@ dataServices:
normalization:
nodeNormEndpoint: https://nodenormalization-sri.renci.org/
edgeNormEndpoint: https://bl-lookup-sri.renci.org/
outputURL: https://stars.renci.org/var/plater/bl-3.2.1/
outputURL: https://stars.renci.org/var/plater/bl-3.5.4/

pharos:
host: pod-host-or-ip
2 changes: 1 addition & 1 deletion parsers/BINDING/src/loadBINDINGDB.py
@@ -50,7 +50,7 @@ class BINDINGDBLoader(SourceDataLoader):
source_data_url = "https://www.bindingdb.org/rwd/bind/chemsearch/marvin/SDFdownload.jsp?all_download=yes"
license = "All data and download files in bindingDB are freely available under a 'Creative Commons BY 3.0' license.'"
attribution = 'https://www.bindingdb.org/rwd/bind/info.jsp'
parsing_version = '1.2'
parsing_version = '1.3'

def __init__(self, test_mode: bool = False, source_data_dir: str = None):
"""
30 changes: 13 additions & 17 deletions parsers/PHAROS/src/loadPHAROS.py
@@ -19,7 +19,7 @@ class PHAROSLoader(SourceDataLoader):
source_data_url = "https://pharos.nih.gov/"
license = "Data accessed from Pharos and TCRD is publicly available from the primary sources listed above. Please respect their individual licenses regarding proper use and redistribution."
attribution = 'Sheils, T., Mathias, S. et al, "TCRD and Pharos 2021: mining the human proteome for disease biology", Nucl. Acids Res., 2021. DOI: 10.1093/nar/gkaa993'
parsing_version: str = '1.4'
parsing_version: str = '1.5'

GENE_TO_DISEASE_QUERY: str = """select distinct x.value, d.did, d.name, p.sym, d.dtype, d.score
from disease d
@@ -100,16 +100,15 @@ def parse_data(self) -> dict:
:return: parsed meta data results
"""

if self.ping_pharos_db():
self.logger.info('Pinging PHAROS database successful..')
else:
error_message = "PHAROS DB was not accessible. " \
"Manually stand up PHAROS DB and configure environment variables before trying again."
raise SourceDataFailedError(error_message=error_message)

final_record_count: int = 0
final_skipped_count: int = 0
final_record_count = 0
final_skipped_count = 0

# get the nodes and edges for each dataset
self.logger.info('Querying for gene to disease..')
@@ -141,11 +140,8 @@

def parse_gene_to_disease(self) -> (int, int):
"""
gets gene to disease records from the pharos DB and creates nodes
:param node_list: list, the node list to append this data to
:return: list, the node list and record counters
gets gene to disease records from the pharos DB and creates nodes and edges
"""
# init the record counters
record_counter: int = 0
skipped_record_counter: int = 0

Expand Down Expand Up @@ -210,11 +206,9 @@ def parse_gene_to_disease(self) -> (int, int):

def parse_gene_to_drug_activity(self) -> (int, int):
"""
gets gene to drug activity records from the pharos DB and creates nodes
:param node_list: list, the node list to append this data to
gets gene to drug activity records from the pharos DB and creates nodes and edges
:return: list, the node list and record counters
"""
# init the record counters
record_counter: int = 0
skipped_record_counter: int = 0

@@ -269,11 +263,9 @@

def parse_gene_to_cmpd_activity(self) -> (int, int):
"""
gets gene to compound activity records from the pharos DB and creates nodes
:param node_list: list, the node list to append this data to
gets gene to compound activity records from the pharos DB and creates nodes and edges
:return: list, the node list and record counters
"""
# init the record counters
record_counter: int = 0
skipped_record_counter: int = 0

@@ -356,9 +348,13 @@ def get_edge_props(self, result) -> (str, list, dict, str):

# if there was affinity data save it
props: dict = {}
if result['affinity'] is not None:
props['affinity'] = float(result['affinity'])
props['affinity_parameter'] = result['affinity_parameter']
affinity = result['affinity']
if affinity is not None and affinity != '':
props['affinity'] = float(affinity)

affinity_parameter = result['affinity_parameter']
if affinity_parameter:
props['affinity_parameter'] = f'p{affinity_parameter}'

# return to the caller
return predicate, pmids, props, provenance
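With the new guards, empty-string affinities are skipped (float('') would raise a ValueError) and the affinity parameter gains a 'p' prefix, e.g. 'Ki' becomes 'pKi', presumably reflecting the negative-log-molar convention. A small demonstration with hypothetical sample rows:

for result in ({'affinity': '7.5', 'affinity_parameter': 'Ki'},   # hypothetical rows
               {'affinity': '',    'affinity_parameter': None}):
    props = {}
    affinity = result['affinity']
    if affinity is not None and affinity != '':
        props['affinity'] = float(affinity)
    affinity_parameter = result['affinity_parameter']
    if affinity_parameter:
        props['affinity_parameter'] = f'p{affinity_parameter}'
    print(props)   # -> {'affinity': 7.5, 'affinity_parameter': 'pKi'}, then {}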
