Skip to content

Commit

Permalink
fixing multiple qualifier issue
Browse files Browse the repository at this point in the history
  • Loading branch information
EvanDietzMorris committed Nov 19, 2024
1 parent 8f0e8d0 commit 93e92b5
Showing 1 changed file with 42 additions and 19 deletions.
61 changes: 42 additions & 19 deletions parsers/camkp/src/loadCAMKP.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ class CAMKPLoader(SourceDataLoader):
source_data_url = "https://github.com/ExposuresProvider/cam-kp-api"
license = "https://github.com/ExposuresProvider/cam-kp-api/blob/master/LICENSE"
attribution = "https://github.com/ExposuresProvider/cam-kp-api"
parsing_version = "1.4"
parsing_version = "1.5"

def __init__(self, test_mode: bool = False, source_data_dir: str = None):
"""
Expand Down Expand Up @@ -97,29 +97,52 @@ def parse_data(self) -> dict:
predicate = line[CAMDATACOLS.PREDICATE.value]
edge_provenance_id = line[CAMDATACOLS.PROVENANCE_ID.value]
edge_provenance_url = line[CAMDATACOLS.PROVENANCE_URL.value]
edge_properties = {
XREFS: [edge_provenance_url],
KNOWLEDGE_LEVEL: KNOWLEDGE_ASSERTION,
AGENT_TYPE: MANUAL_AGENT
}
# if it has enough columns to have a qualifier

# The following qualifier section is hacky and bad.
# This is mostly because the source data can have multiple instances of the same qualifier
# on the same edge, this splits those into multiple edges.
# Request to change the source data has been made and this should be changed after.

# If it has enough columns to have a qualifier
if len(line) >= CAMDATACOLS.QUALIFIERS.value + 1:
# qualifier format looks like:
# (biolink:anatomical_context_qualifier=GO:0005634)&&(biolink:anatomical_context_qualifier=CL:0008019)
# split on && then remove the parentheses
qualifiers = [qualifier.strip("()") for qualifier in line[CAMDATACOLS.QUALIFIERS.value].split('&&')]
qualifier_strings = [qualifier.strip("()") for qualifier in line[CAMDATACOLS.QUALIFIERS.value].split('&&')]

# parse the strings which are like biolink:anatomical_context_qualifier=GO:0005634
qualifiers = []
for qualifier_string in qualifier_strings:
qualifier_list = qualifier_string.split('=')
q_key = qualifier_list[0].removeprefix("biolink:")
if q_key != "anatomical_context_qualifier":
raise RuntimeError(f'Unsupported qualifier: {q_key}')
q_value = qualifier_list[1]
qualifiers.append({q_key: q_value})

# if the source data changes as mentioned above we could just do this and not split them up
# make a dict of qualifier_key: qualifier_value, remove the biolink prefix
qualifiers = {qual[0].removeprefix("biolink:"): qual[1] for qual in
[qualifier.split('=') for qualifier in qualifiers]}
edge_properties.update(qualifiers)
new_edge = kgxedge(subject_id=subject_id,
object_id=object_id,
predicate=predicate,
primary_knowledge_source=edge_provenance_id,
aggregator_knowledge_sources=[self.aggregator_knowledge_source],
edgeprops=edge_properties)
self.output_file_writer.write_kgx_edge(new_edge)
record_counter += 1
# qualifiers = {qual[0].removeprefix("biolink:"): qual[1] for qual in
# [qualifier.split('=') for qualifier in qualifiers]}
# edge_properties.update(qualifiers)
else:
qualifiers = [{}]

for qualifier in qualifiers:
edge_properties = {
XREFS: [edge_provenance_url],
KNOWLEDGE_LEVEL: KNOWLEDGE_ASSERTION,
AGENT_TYPE: MANUAL_AGENT,
**qualifier
}
new_edge = kgxedge(subject_id=subject_id,
object_id=object_id,
predicate=predicate,
primary_knowledge_source=edge_provenance_id,
aggregator_knowledge_sources=[self.aggregator_knowledge_source],
edgeprops=edge_properties)
self.output_file_writer.write_kgx_edge(new_edge)
record_counter += 1

load_metadata: dict = {'num_source_lines': record_counter,
'unusable_source_lines': skipped_record_counter}
Expand Down

0 comments on commit 93e92b5

Please sign in to comment.