fixing multiple qualifier issue

RobokopU24 · Nov 19, 2024 · 93e92b5 · 93e92b5
1 parent 8f0e8d0
commit 93e92b5
Showing 1 changed file with 42 additions and 19 deletions.
diff --git a/parsers/camkp/src/loadCAMKP.py b/parsers/camkp/src/loadCAMKP.py
@@ -36,7 +36,7 @@ class CAMKPLoader(SourceDataLoader):
     source_data_url = "https://github.com/ExposuresProvider/cam-kp-api"
     license = "https://github.com/ExposuresProvider/cam-kp-api/blob/master/LICENSE"
     attribution = "https://github.com/ExposuresProvider/cam-kp-api"
-    parsing_version = "1.4"
+    parsing_version = "1.5"
 
     def __init__(self, test_mode: bool = False, source_data_dir: str = None):
         """
@@ -97,29 +97,52 @@ def parse_data(self) -> dict:
                 predicate = line[CAMDATACOLS.PREDICATE.value]
                 edge_provenance_id = line[CAMDATACOLS.PROVENANCE_ID.value]
                 edge_provenance_url = line[CAMDATACOLS.PROVENANCE_URL.value]
-                edge_properties = {
-                    XREFS: [edge_provenance_url],
-                    KNOWLEDGE_LEVEL: KNOWLEDGE_ASSERTION,
-                    AGENT_TYPE: MANUAL_AGENT
-                }
-                # if it has enough columns to have a qualifier
+
+                # The following qualifier section is hacky and bad.
+                # This is mostly because the source data can have multiple instances of the same qualifier
+                # on the same edge, this splits those into multiple edges.
+                # Request to change the source data has been made and this should be changed after.
+
+                # If it has enough columns to have a qualifier
                 if len(line) >= CAMDATACOLS.QUALIFIERS.value + 1:
                     # qualifier format looks like:
                     # (biolink:anatomical_context_qualifier=GO:0005634)&&(biolink:anatomical_context_qualifier=CL:0008019)
                     # split on && then remove the parentheses
-                    qualifiers = [qualifier.strip("()") for qualifier in line[CAMDATACOLS.QUALIFIERS.value].split('&&')]
+                    qualifier_strings = [qualifier.strip("()") for qualifier in line[CAMDATACOLS.QUALIFIERS.value].split('&&')]
+
+                    # parse the strings which are like biolink:anatomical_context_qualifier=GO:0005634
+                    qualifiers = []
+                    for qualifier_string in qualifier_strings:
+                        qualifier_list = qualifier_string.split('=')
+                        q_key = qualifier_list[0].removeprefix("biolink:")
+                        if q_key != "anatomical_context_qualifier":
+                            raise RuntimeError(f'Unsupported qualifier: {q_key}')
+                        q_value = qualifier_list[1]
+                        qualifiers.append({q_key: q_value})
+
+                    # if the source data changes as mentioned above we could just do this and not split them up
                     # make a dict of qualifier_key: qualifier_value, remove the biolink prefix
-                    qualifiers = {qual[0].removeprefix("biolink:"): qual[1] for qual in
-                                  [qualifier.split('=') for qualifier in qualifiers]}
-                    edge_properties.update(qualifiers)
-                new_edge = kgxedge(subject_id=subject_id,
-                                   object_id=object_id,
-                                   predicate=predicate,
-                                   primary_knowledge_source=edge_provenance_id,
-                                   aggregator_knowledge_sources=[self.aggregator_knowledge_source],
-                                   edgeprops=edge_properties)
-                self.output_file_writer.write_kgx_edge(new_edge)
-                record_counter += 1
+                    # qualifiers = {qual[0].removeprefix("biolink:"): qual[1] for qual in
+                    #             [qualifier.split('=') for qualifier in qualifiers]}
+                    # edge_properties.update(qualifiers)
+                else:
+                    qualifiers = [{}]
+
+                for qualifier in qualifiers:
+                    edge_properties = {
+                        XREFS: [edge_provenance_url],
+                        KNOWLEDGE_LEVEL: KNOWLEDGE_ASSERTION,
+                        AGENT_TYPE: MANUAL_AGENT,
+                        **qualifier
+                    }
+                    new_edge = kgxedge(subject_id=subject_id,
+                                       object_id=object_id,
+                                       predicate=predicate,
+                                       primary_knowledge_source=edge_provenance_id,
+                                       aggregator_knowledge_sources=[self.aggregator_knowledge_source],
+                                       edgeprops=edge_properties)
+                    self.output_file_writer.write_kgx_edge(new_edge)
+                    record_counter += 1
 
         load_metadata: dict = {'num_source_lines': record_counter,
                                'unusable_source_lines': skipped_record_counter}