Skip to content

Commit

Permalink
Merge pull request #441 from biolink/find_dependency
Browse files (browse the repository at this point in the history)
validation: speed up log_error method
  • Loading branch information
sierra-moxon authored Mar 29, 2023
2 parents (3399151 + 42634fc); commit b151442
Show file tree
Hide file tree
Showing 10 changed files with 646 additions and 547 deletions.
11 changes: 3 additions & 8 deletions kgx/error_detection.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,14 +119,9 @@ def log_error(

if error not in self.errors[level]:
self.errors[level][error] = dict()

# don't record duplicate instances of error type and
# messages for entity identifiers...
if message not in self.errors[level][error]:
self.errors[level][error][message] = [entity]
else:
if entity not in self.errors[level][error][message]:
self.errors[level][error][message].append(entity)

self.errors[level][error][message] = [entity]
self.errors[level][error][message].append(entity)

def get_errors(self, level: str = None) -> Dict:
"""
Expand Down
6 changes: 5 additions & 1 deletion kgx/sink/rdf_sink.py
Original file line number Diff line number Diff line change
Expand Up @@ -237,8 +237,12 @@ def write_edge(self, record: Dict) -> None:
value_uri = self._prepare_object(prop, prop_type, value)
self._write_triple(URIRef(n), prop_uri, value_uri)
else:
if "type" in record:
for type in record["type"]:
if type in associations:
at_least_one_type_in_associations = True
if (
("type" in record and record["type"] in associations)
("type" in record and at_least_one_type_in_associations)
or (
"association_type" in record
and record["association_type"] in associations
Expand Down
10 changes: 8 additions & 2 deletions kgx/utils/kgx_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@
import numpy as np
from prefixcommons.curie_util import contract_uri
from prefixcommons.curie_util import expand_uri

from kgx.config import get_logger, get_jsonld_context, get_biolink_model_schema
from kgx.graph.base_graph import BaseGraph

Expand All @@ -34,6 +33,8 @@
CORE_EDGE_PROPERTIES = {"id", "subject", "predicate", "object", "type"}
XSD_STRING = "xsd:string"

tk = Toolkit()


class GraphEntityType(Enum):
GRAPH = "graph"
Expand All @@ -53,6 +54,7 @@ class GraphEntityType(Enum):
"provided_by": list,
}


column_types = {
"publications": list,
"qualifiers": list,
Expand Down Expand Up @@ -833,6 +835,7 @@ def _sanitize_import_property(key: str, value: Any, list_delimiter: str) -> Any:
Sanitized value
"""

if key in column_types:
if column_types[key] == list:
if isinstance(value, (list, set, tuple)):
Expand All @@ -843,7 +846,7 @@ def _sanitize_import_property(key: str, value: Any, list_delimiter: str) -> Any:
new_value = list(value)
elif isinstance(value, str):
value = value.replace("\n", " ").replace("\t", " ")
new_value = [x for x in value.split(list_delimiter) if x] if list_delimiter else value
new_value = [x for x in value.split(list_delimiter) if x] if list_delimiter else [value]
else:
new_value = [str(value).replace("\n", " ").replace("\t", " ")]
# remove duplication in the list
Expand Down Expand Up @@ -871,9 +874,12 @@ def _sanitize_import_property(key: str, value: Any, list_delimiter: str) -> Any:
]
new_value = list(value)
elif isinstance(value, str):
multivalued_slots = [sentencecase_to_snakecase(x) for x in tk.get_all_multivalued_slots()]
if list_delimiter and list_delimiter in value:
value = value.replace("\n", " ").replace("\t", " ")
new_value = [x for x in value.split(list_delimiter) if x]
elif key in multivalued_slots:
new_value = [value]
else:
new_value = value.replace("\n", " ").replace("\t", " ")
elif isinstance(value, bool):
Expand Down
Loading

0 comments on commit b151442

Please sign in to comment.