Merge pull request #188 from RobokopU24/parser_and_normalization_updates
Parser and normalization updates
beasleyjonm authored Oct 27, 2023
2 parents 6167ba6 + 2cd4ed7 commit 79a8241
Showing 16 changed files with 369 additions and 74 deletions.
8 changes: 5 additions & 3 deletions Common/data_sources.py
@@ -5,10 +5,10 @@
BIOLINK = 'Biolink'
CAM_KP = 'CAM-KP'
CHEBI_PROPERTIES = 'CHEBIProps'
REACTOME = 'Reactome'
CORD19 = 'Cord19'
CTD = 'CTD'
DRUG_CENTRAL = 'DrugCentral'
DRUGMECHDB = 'DrugMechDB'
# FOODB = 'FooDB' # this is on hold, data needs review after latest release of data.
GENOME_ALLIANCE_ORTHOLOGS = 'GenomeAllianceOrthologs'
GTEX = 'GTEx'
@@ -24,6 +24,7 @@
PANTHER = 'PANTHER'
PHAROS = 'PHAROS'
PLANT_GOA = 'PlantGOA'
REACTOME = 'Reactome'
SCENT = 'Scent'
SGD = 'SGD'
HUMAN_STRING = 'STRING-DB-Human'
@@ -39,17 +40,17 @@
YEAST_STRING = 'STRING-DB-Yeast'

RESOURCE_HOGS = [GTEX, GWAS_CATALOG, UNIREF, ONTOLOGICAL_HIERARCHY, UBERGRAPH_REDUNDANT,
SGD, HUMAN_STRING, CAM_KP]
SGD, HUMAN_STRING]

SOURCE_DATA_LOADER_CLASS_IMPORTS = {
BINDING_DB: ("parsers.BINDING.src.loadBINDINGDB", "BINDINGDBLoader"),
BIOLINK: ("parsers.biolink.src.loadBL", "BLLoader"),
CAM_KP: ("parsers.camkp.src.loadCAMKP", "CAMKPLoader"),
CHEBI_PROPERTIES: ("parsers.chebi.src.loadChebiProperties", "ChebiPropertiesLoader"),
REACTOME: ("parsers.Reactome.src.loadReactome", "ReactomeLoader"),
CORD19: ("parsers.cord19.src.loadCord19", "Cord19Loader"),
CTD: ("parsers.CTD.src.loadCTD", "CTDLoader"),
DRUG_CENTRAL: ("parsers.drugcentral.src.loaddrugcentral", "DrugCentralLoader"),
DRUGMECHDB: ("parsers.drugmechdb.src.loadDrugMechDB", "DrugMechDBLoader"),
GENOME_ALLIANCE_ORTHOLOGS: ("parsers.GenomeAlliance.src.loadGenomeAlliance", "GenomeAllianceOrthologLoader"),
GTEX: ("parsers.GTEx.src.loadGTEx", "GTExLoader"),
GTOPDB: ("parsers.gtopdb.src.loadGtoPdb", "GtoPdbLoader"),
@@ -65,6 +66,7 @@
PANTHER: ("parsers.panther.src.loadPanther", "PLoader"),
PHAROS: ("parsers.PHAROS.src.loadPHAROS", "PHAROSLoader"),
PLANT_GOA: ("parsers.GOA.src.loadGOA", "PlantGOALoader"),
REACTOME: ("parsers.Reactome.src.loadReactome", "ReactomeLoader"),
SCENT: ("parsers.scent.src.loadScent", "ScentLoader"),
SGD: ("parsers.SGD.src.loadSGD", "SGDLoader"),
TEXT_MINING_KP: ("parsers.textminingkp.src.loadTMKP", "TMKPLoader"),
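For context, SOURCE_DATA_LOADER_CLASS_IMPORTS pairs each source ID with a (module path, class name) tuple, so a loader is only imported when its source is actually processed. A minimal sketch of how such a mapping is typically consumed; the helper function below is hypothetical, not part of this commit:

import importlib

def get_loader_class(source_id: str):
    # Hypothetical helper (not from this commit): resolve a source id to its
    # loader class, importing the parser module lazily on first use.
    module_path, class_name = SOURCE_DATA_LOADER_CLASS_IMPORTS[source_id]
    module = importlib.import_module(module_path)
    return getattr(module, class_name)

# e.g. get_loader_class(REACTOME) would import parsers.Reactome.src.loadReactome
# and return its ReactomeLoader class.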
6 changes: 5 additions & 1 deletion Common/kgx_file_normalizer.py
@@ -144,7 +144,11 @@ def normalize_node_file(self):
regular_nodes_pre_norm += len(regular_nodes)
if regular_nodes:
self.logger.debug(f'Normalizing {len(regular_nodes)} regular nodes...')
self.node_normalizer.normalize_node_data(regular_nodes)
try:
self.node_normalizer.normalize_node_data(regular_nodes)
except Exception as e:
raise NormalizationFailedError(error_message='Error during node normalization.',
actual_error=e)
regular_nodes_post_norm += len(regular_nodes)
if regular_nodes:
self.logger.info(f'Normalized {regular_nodes_pre_norm} regular nodes so far...')
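The new try/except re-raises any failure during node normalization as a NormalizationFailedError while keeping the root cause. The constructor signature can be inferred from the call site above; the class body below is a sketch under that assumption, not the repository's actual definition:

class NormalizationFailedError(Exception):
    # Sketch only: signature inferred from the call site above; the real
    # class lives elsewhere in the repo and may differ.
    def __init__(self, error_message: str, actual_error: Exception = None):
        super().__init__(error_message)
        self.error_message = error_message
        self.actual_error = actual_error  # preserved root cause for logging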
31 changes: 25 additions & 6 deletions Common/meta_kg.py
@@ -26,8 +26,7 @@ def __init__(self,
nodes_file_path: str,
edges_file_path: str,
logger=None):
if logger:
self.logger = logger
self.logger = logger
self.bl_utils = BiolinkUtils()

self.node_id_to_leaf_types = None
@@ -62,7 +61,16 @@ def analyze_nodes(self, nodes_file_path: str):
for node in quick_jsonl_file_iterator(nodes_file_path):

# find the leaf node types of this node's types according to the biolink model, if not done already
node_types = frozenset(node[NODE_TYPES])
try:
node_types = frozenset(node[NODE_TYPES])
except TypeError as e:
error_message = f'Node types were not a valid list for node: {node}'
if self.logger:
self.logger.error(error_message)
else:
print(error_message)
node_types = frozenset()

if node_types not in node_types_to_leaves:
leaf_types = self.bl_utils.find_biolink_leaves(node_types)
node_types_to_leaves[node_types] = leaf_types
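Two things happen in this hunk: a malformed category value (e.g. None) now falls back to an empty frozenset instead of crashing, and the frozenset doubles as a dictionary key so find_biolink_leaves runs only once per distinct combination of node types. A simplified, self-contained illustration of that memoization pattern; the lookup function is a stand-in:

node_types_to_leaves = {}

def expensive_leaf_lookup(types: frozenset) -> frozenset:
    return types  # stand-in for self.bl_utils.find_biolink_leaves(types)

def leaf_types_for(node: dict) -> frozenset:
    try:
        node_types = frozenset(node['category'])
    except TypeError:  # category was None or otherwise not iterable
        node_types = frozenset()
    if node_types not in node_types_to_leaves:  # frozenset is hashable
        node_types_to_leaves[node_types] = expensive_leaf_lookup(node_types)
    return node_types_to_leaves[node_types]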
@@ -108,9 +116,12 @@ def analyze_edges(self, edges_file_path: str):
subject_types = node_id_to_leaf_types[edge[SUBJECT_ID]]
object_types = node_id_to_leaf_types[edge[OBJECT_ID]]
except KeyError as e:
error_message = f'Leaf node types not found for node: {e}. '\
f'Make sure the node is present in the nodes file.'
if self.logger:
self.logger.error(f'Leaf node types not found for a node on edge {json.dumps(edge)}. '
f'Make sure the nodes are present in the nodes file. KeyError: {e}')
self.logger.error(error_message)
else:
print(error_message)
continue

non_core_edge_attributes = [key for key in edge.keys()
@@ -136,7 +147,15 @@
edge_type_key = f'{subject_type}{object_type}{predicate}'
edge_type_key_to_attributes[edge_type_key].update(edge_attributes)
for qual, qual_val in edge_qualifier_values.items():
edge_type_key_to_qualifiers[edge_type_key][qual].add(qual_val)
try:
edge_type_key_to_qualifiers[edge_type_key][qual].add(qual_val)
except TypeError as e:
error_message = f'Type of value for qualifier not expected: {qual}: {qual_val}, '\
f'ignoring for meta kg. Error: {e}'
if self.logger:
self.logger.warning(error_message)
else:
print(error_message)

if edge_type_key not in edge_type_key_to_example:
example_edge = {
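The qualifier guard added above exists because qualifier values are accumulated in sets, and set.add raises TypeError for unhashable values such as lists. A small demonstration, assuming a nested defaultdict-of-sets, which matches the access pattern in the diff; the key is illustrative only:

from collections import defaultdict

edge_type_key_to_qualifiers = defaultdict(lambda: defaultdict(set))
key = 'biolink:Genebiolink:Diseasebiolink:affects'  # illustrative subject+object+predicate key

edge_type_key_to_qualifiers[key]['object_aspect_qualifier'].add('activity')      # fine
try:
    edge_type_key_to_qualifiers[key]['object_aspect_qualifier'].add(['activity'])  # a list
except TypeError as e:
    print(f'Type of value for qualifier not expected, ignoring for meta kg. Error: {e}')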
1 change: 1 addition & 0 deletions Common/node_types.py
@@ -67,6 +67,7 @@
NODE_TYPES = 'category'
SYNONYMS = 'equivalent_identifiers'
INFORMATION_CONTENT = 'information_content'
DESCRIPTION = 'description'

FALLBACK_EDGE_PREDICATE = 'biolink:related_to'

21 changes: 14 additions & 7 deletions Common/normalization.py
@@ -114,7 +114,8 @@ def normalize_node_data(self, node_list: list, block_size: int = 1000) -> list:
# get the data
resp: requests.models.Response = requests.post(f'{self.node_norm_endpoint}get_normalized_nodes',
json={'curies': data_chunk,
'conflate': self.conflate_node_types})
'conflate': self.conflate_node_types,
'description': True})

# did we get a good status code
if resp.status_code == 200:
@@ -198,17 +199,23 @@ def normalize_node_data(self, node_list: list, block_size: int = 1000) -> list:
current_node_normalization = cached_node_norms[current_node_id]
if current_node_normalization is not None:

current_node_id_section = current_node_normalization['id']

# update the node with the normalized info
normalized_id = current_node_normalization['id']['identifier']
normalized_id = current_node_id_section['identifier']
current_node['id'] = normalized_id
current_node[NODE_TYPES] = current_node_normalization['type']
current_node[SYNONYMS] = list(item['identifier'] for item in current_node_normalization[SYNONYMS])
if INFORMATION_CONTENT in current_node_normalization:
current_node[INFORMATION_CONTENT] = current_node_normalization[INFORMATION_CONTENT]

# set the name as the label if it exists
if 'label' in current_node_normalization['id']:
current_node['name'] = current_node_normalization['id']['label']
# set the name as the primary label if it exists
if 'label' in current_node_id_section:
current_node['name'] = current_node_id_section['label']

# set the node description and/or information content if they are present
if 'information_content' in current_node_normalization:
current_node[INFORMATION_CONTENT] = current_node_normalization[INFORMATION_CONTENT]
if 'description' in current_node_id_section:
current_node[DESCRIPTION] = current_node_id_section['description']

self.node_normalization_lookup[current_node_id] = [normalized_id]
else:
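The request now asks the node normalizer for descriptions as well, and the parsing reads the label and description off the 'id' section of each result. A sketch of the call and the response shape this code expects; the shape is inferred from the diff and the example CURIE is hypothetical:

import requests

resp = requests.post('https://nodenormalization-sri.renci.org/get_normalized_nodes',
                     json={'curies': ['MONDO:0005148'],   # hypothetical example CURIE
                           'conflate': True,
                           'description': True})          # new in this commit
result = resp.json()['MONDO:0005148']
id_section = result['id']    # {'identifier': ..., 'label': ..., 'description': ...}
normalized_id = id_section['identifier']
name = id_section.get('label')                # primary label, when present
description = id_section.get('description')   # only present when available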
8 changes: 1 addition & 7 deletions graph_specs/default-graph-spec.yml
@@ -56,9 +56,6 @@ graphs:
sources:
# using an older version of GWASCatalog because EFO terms are not normalizing right now
- source_id: GWASCatalog
source_version: '8_23_2022'
edge_normalization_version: 'v.2.4.4'
node_normalization_version: '2.0.9'
- source_id: GTEx

- graph_id: BINDING_Automat
@@ -157,9 +154,6 @@
output_format: neo4j
sources:
- source_id: GWASCatalog
source_version: '8_23_2022'
edge_normalization_version: 'v.2.4.4'
node_normalization_version: '2.0.9'
- source_id: OntologicalHierarchy
merge_strategy: connected_edge_subset

@@ -249,7 +243,7 @@
graph_url: https://string-db.org/
output_format: neo4j
sources:
- source_id: STRING-DB
- source_id: STRING-DB-Human
- source_id: OntologicalHierarchy
merge_strategy: connected_edge_subset

4 changes: 2 additions & 2 deletions helm/data-services/renci-values.yaml
@@ -13,7 +13,7 @@ dataServices:
image:
repository: ghcr.io/robokopu24/orion
pullPolicy: IfNotPresent
tag: v1.0.7-2
tag: v1.0.9
graphsVolume:
use_nfs: true
nfs_server: na-projects.edc.renci.org
@@ -46,7 +46,7 @@ dataServices:
normalization:
nodeNormEndpoint: https://nodenormalization-sri.renci.org/
edgeNormEndpoint: https://bl-lookup-sri.renci.org/
outputURL: https://stars.renci.org/var/plater/bl-3.2.1/
outputURL: https://stars.renci.org/var/plater/bl-3.5.4/

pharos:
host: pod-host-or-ip
2 changes: 1 addition & 1 deletion parsers/BINDING/src/loadBINDINGDB.py
@@ -50,7 +50,7 @@ class BINDINGDBLoader(SourceDataLoader):
source_data_url = "https://www.bindingdb.org/rwd/bind/chemsearch/marvin/SDFdownload.jsp?all_download=yes"
license = "All data and download files in bindingDB are freely available under a 'Creative Commons BY 3.0' license.'"
attribution = 'https://www.bindingdb.org/rwd/bind/info.jsp'
parsing_version = '1.2'
parsing_version = '1.3'

def __init__(self, test_mode: bool = False, source_data_dir: str = None):
"""
30 changes: 13 additions & 17 deletions parsers/PHAROS/src/loadPHAROS.py
@@ -19,7 +19,7 @@ class PHAROSLoader(SourceDataLoader):
source_data_url = "https://pharos.nih.gov/"
license = "Data accessed from Pharos and TCRD is publicly available from the primary sources listed above. Please respect their individual licenses regarding proper use and redistribution."
attribution = 'Sheils, T., Mathias, S. et al, "TCRD and Pharos 2021: mining the human proteome for disease biology", Nucl. Acids Res., 2021. DOI: 10.1093/nar/gkaa993'
parsing_version: str = '1.4'
parsing_version: str = '1.5'

GENE_TO_DISEASE_QUERY: str = """select distinct x.value, d.did, d.name, p.sym, d.dtype, d.score
from disease d
@@ -100,16 +100,15 @@ def parse_data(self) -> dict:
:return: parsed meta data results
"""

if self.ping_pharos_db():
self.logger.info('Pinging PHAROS database successful..')
else:
error_message = "PHAROS DB was not accessible. " \
"Manually stand up PHAROS DB and configure environment variables before trying again."
raise SourceDataFailedError(error_message=error_message)

final_record_count: int = 0
final_skipped_count: int = 0
final_record_count = 0
final_skipped_count = 0

# get the nodes and edges for each dataset
self.logger.info('Querying for gene to disease..')
@@ -141,11 +140,8 @@

def parse_gene_to_disease(self) -> (int, int):
"""
gets gene to disease records from the pharos DB and creates nodes
:param node_list: list, the node list to append this data to
:return: list, the node list and record counters
gets gene to disease records from the pharos DB and creates nodes and edges
"""
# init the record counters
record_counter: int = 0
skipped_record_counter: int = 0

Expand Down Expand Up @@ -210,11 +206,9 @@ def parse_gene_to_disease(self) -> (int, int):

def parse_gene_to_drug_activity(self) -> (int, int):
"""
gets gene to drug activity records from the pharos DB and creates nodes
:param node_list: list, the node list to append this data to
gets gene to drug activity records from the pharos DB and creates nodes and edges
:return: list, the node list and record counters
"""
# init the record counters
record_counter: int = 0
skipped_record_counter: int = 0

@@ -269,11 +263,9 @@

def parse_gene_to_cmpd_activity(self) -> (int, int):
"""
gets gene to compound activity records from the pharos DB and creates nodes
:param node_list: list, the node list to append this data to
gets gene to compound activity records from the pharos DB and creates nodes and edges
:return: list, the node list and record counters
"""
# init the record counters
record_counter: int = 0
skipped_record_counter: int = 0

@@ -356,9 +348,13 @@ def get_edge_props(self, result) -> (str, list, dict, str):

# if there was affinity data save it
props: dict = {}
if result['affinity'] is not None:
props['affinity'] = float(result['affinity'])
props['affinity_parameter'] = result['affinity_parameter']
affinity = result['affinity']
if affinity is not None and affinity != '':
props['affinity'] = float(affinity)

affinity_parameter = result['affinity_parameter']
if affinity_parameter:
props['affinity_parameter'] = f'p{affinity_parameter}'

# return to the caller
return predicate, pmids, props, provenance
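With the new guards, empty-string affinities are skipped (float('') would raise a ValueError) and the affinity parameter gains a 'p' prefix, e.g. 'Ki' becomes 'pKi', presumably reflecting the negative-log-molar convention. A small demonstration with hypothetical sample rows:

for result in ({'affinity': '7.5', 'affinity_parameter': 'Ki'},   # hypothetical rows
               {'affinity': '',    'affinity_parameter': None}):
    props = {}
    affinity = result['affinity']
    if affinity is not None and affinity != '':
        props['affinity'] = float(affinity)
    affinity_parameter = result['affinity_parameter']
    if affinity_parameter:
        props['affinity_parameter'] = f'p{affinity_parameter}'
    print(props)   # -> {'affinity': 7.5, 'affinity_parameter': 'pKi'}, then {}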
