Skip to content

Commit

Permalink
Merge pull request #252 from RobokopU24/neo4j_5
Browse files Browse the repository at this point in the history
Neo4j 5
  • Loading branch information
beasleyjonm authored Aug 13, 2024
2 parents 74ea571 + 0a22609 commit b124dfb
Show file tree
Hide file tree
Showing 24 changed files with 520 additions and 236 deletions.
20 changes: 10 additions & 10 deletions .github/workflows/release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,27 +13,27 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Check out the repo
uses: actions/checkout@v2
uses: actions/checkout@v4
- name: Get the version
id: get_version
run: echo ::set-output name=VERSION::${GITHUB_REF/refs\/tags\//}
- name: Extract metadata (tags, labels) for Docker
id: meta
uses: docker/metadata-action@98669ae865ea3cffbcbaa878cf57c20bbf1c6c38
with:
images:
ghcr.io/${{ github.repository }}
- name: Login to ghcr
uses: docker/login-action@v1
uses: docker/login-action@f4ef78c080cd8ba55a85445d5b36e214a81df20a
with:
registry: ${{ env.REGISTRY }}
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Extract metadata (tags, labels) for Docker
id: meta
uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7
with:
images:
ghcr.io/${{ github.repository }}
- name: Push to GitHub Packages
uses: docker/build-push-action@ad44023a93711e3deb337508980b4b5e9bcdc5dc
uses: docker/build-push-action@3b5e8027fcad23fda98b2e3ac259d8d67585f671
with:
context: .
push: true
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}
build-args: VERSION=${{ steps.get_version.outputs.VERSION }}
build-args: VERSION=${{ steps.get_version.outputs.VERSION }}
6 changes: 6 additions & 0 deletions Common/biolink_constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,10 +48,13 @@
PREDICATE = 'predicate'
PRIMARY_KNOWLEDGE_SOURCE = 'primary_knowledge_source'
AGGREGATOR_KNOWLEDGE_SOURCES = 'aggregator_knowledge_source'
SUPPORTING_DATA_SOURCE = 'supporting_data_source'
P_VALUE = 'p_value'
ADJUSTED_P_VALUE = 'adjusted_p_value'
AGENT_TYPE = 'agent_type'
KNOWLEDGE_LEVEL = 'knowledge_level'
MAX_RESEARCH_PHASE = 'max_research_phase'
HAS_SUPPORTING_STUDY_RESULT = 'has_supporting_study_result'

# enums for knowledge level
KNOWLEDGE_ASSERTION = 'knowledge_assertion'
Expand Down Expand Up @@ -137,6 +140,7 @@
PREDICATE,
PRIMARY_KNOWLEDGE_SOURCE,
AGGREGATOR_KNOWLEDGE_SOURCES,
SUPPORTING_DATA_SOURCE,
PUBLICATIONS,
SYNONYMS,
DESCRIPTION,
Expand All @@ -147,6 +151,8 @@
FDA_APPROVAL_STATUS,
KNOWLEDGE_LEVEL,
MECHANISM_OF_ACTION,
MAX_RESEARCH_PHASE,
HAS_SUPPORTING_STUDY_RESULT,
# qualifiers
ANATOMICAL_CONTEXT_QUALIFIER,
CAUSAL_MECHANISM_QUALIFIER,
Expand Down
7 changes: 3 additions & 4 deletions Common/build_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@
from Common.load_manager import SourceDataManager
from Common.kgx_file_merger import KGXFileMerger
from Common.neo4j_tools import create_neo4j_dump
from Common.kgxmodel import GraphSpec, SubGraphSource, DataSource, NormalizationScheme
from Common.normalization import NORMALIZATION_CODE_VERSION
from Common.kgxmodel import GraphSpec, SubGraphSource, DataSource
from Common.normalization import NORMALIZATION_CODE_VERSION, NormalizationScheme
from Common.metadata import Metadata, GraphMetadata, SourceMetadata
from Common.supplementation import SequenceVariantSupplementation
from Common.biolink_constants import PRIMARY_KNOWLEDGE_SOURCE, AGGREGATOR_KNOWLEDGE_SOURCES, PREDICATE, PUBLICATIONS
Expand Down Expand Up @@ -139,8 +139,7 @@ def build_dependencies(self, graph_spec: GraphSpec):
subgraph_version = subgraph_source.version
if self.check_for_existing_graph_dir(subgraph_id, subgraph_version):
# load previous metadata
graph_metadata = self.get_graph_metadata(subgraph_id, subgraph_version)
subgraph_source.graph_metadata = graph_metadata.metadata
subgraph_source.graph_metadata = self.get_graph_metadata(subgraph_id, subgraph_version)
elif self.current_graph_versions[subgraph_id] == subgraph_version:
self.logger.warning(f'For graph {graph_spec.graph_id} subgraph dependency '
f'{subgraph_id} version {subgraph_version} is not ready. Building now...')
Expand Down
2 changes: 2 additions & 0 deletions Common/data_sources.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
BINDING_DB = 'BINDING-DB'
CAM_KP = 'CAM-KP'
CHEBI_PROPERTIES = 'CHEBIProps'
CLINICAL_TRIALS_KP = 'ClinicalTrialsKP'
CORD19 = 'Cord19'
CTD = 'CTD'
DRUG_CENTRAL = 'DrugCentral'
Expand Down Expand Up @@ -51,6 +52,7 @@
BINDING_DB: ("parsers.BINDING.src.loadBINDINGDB", "BINDINGDBLoader"),
CAM_KP: ("parsers.camkp.src.loadCAMKP", "CAMKPLoader"),
CHEBI_PROPERTIES: ("parsers.chebi.src.loadChebiProperties", "ChebiPropertiesLoader"),
CLINICAL_TRIALS_KP: ("parsers.clinicaltrials.src.loadCTKP", "CTKPLoader"),
CORD19: ("parsers.cord19.src.loadCord19", "Cord19Loader"),
CTD: ("parsers.CTD.src.loadCTD", "CTDLoader"),
DRUG_CENTRAL: ("parsers.drugcentral.src.loaddrugcentral", "DrugCentralLoader"),
Expand Down
9 changes: 6 additions & 3 deletions Common/kgx_file_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ def __determine_properties_and_types(file_path: str, required_properties: dict):
for key, value in entity.items():
if value is None:
property_type_counts[key]["None"] += 1
if key in required_properties:
if key in required_properties and key != "name":
print(f'WARNING: Required property ({key}) was None: {entity.items()}')
raise Exception(
f'None found as a value for a required property (property: {key}) in line {entity.items()}')
Expand Down Expand Up @@ -134,7 +134,7 @@ def __determine_properties_and_types(file_path: str, required_properties: dict):
# if 'None' in prop_types:
# print(f'WARNING: None found as a value for property {prop}')

if prop in required_properties and (num_prop_types > 1):
if prop in required_properties and (num_prop_types > 1) and prop != "name":
# TODO this should just enforce that required properties are the correct type,
# instead of trying to establish the type
raise Exception(f'Required property {prop} had multiple conflicting types: {type_counts.items()}')
Expand Down Expand Up @@ -192,7 +192,10 @@ def __convert_to_csv(input_file: str,
for item in quick_jsonl_file_iterator(input_file):
for key in list(item.keys()):
if item[key] is None:
del item[key]
if key == "name":
item["name"] = item["id"]
else:
del item[key]
else:
prop_type = properties[key]
# convert lists into strings with an array delimiter
Expand Down
6 changes: 4 additions & 2 deletions Common/kgx_file_merger.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,8 +85,10 @@ def merge_primary_sources(self,
needs_on_disk_merge = False
for graph_source in graph_sources:
if isinstance(graph_source, SubGraphSource):
needs_on_disk_merge = True
break
for source_id in graph_source.graph_metadata.get_source_ids():
if source_id in RESOURCE_HOGS:
needs_on_disk_merge = True
break
elif graph_source.id in RESOURCE_HOGS:
needs_on_disk_merge = True
break
Expand Down
17 changes: 3 additions & 14 deletions Common/kgx_file_normalizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,25 +5,13 @@
from Common.biolink_utils import BiolinkInformationResources, INFORES_STATUS_INVALID, INFORES_STATUS_DEPRECATED
from Common.biolink_constants import SEQUENCE_VARIANT, PRIMARY_KNOWLEDGE_SOURCE, AGGREGATOR_KNOWLEDGE_SOURCES, \
PUBLICATIONS, OBJECT_ID, SUBJECT_ID, PREDICATE, SUBCLASS_OF
from Common.normalization import NodeNormalizer, EdgeNormalizer, EdgeNormalizationResult
from Common.normalization import NormalizationScheme, NodeNormalizer, EdgeNormalizer, EdgeNormalizationResult, \
NormalizationFailedError
from Common.utils import LoggingUtil, chunk_iterator
from Common.kgx_file_writer import KGXFileWriter
from Common.kgxmodel import NormalizationScheme
from Common.merging import MemoryGraphMerger, DiskGraphMerger


class NormalizationBrokenError(Exception):
    """Raised when normalization cannot proceed due to a problem in the
    normalization tooling itself (as opposed to a failure on one source).

    :param error_message: human-readable description of what broke
    :param actual_error: the underlying exception that triggered this, if any
    """
    def __init__(self, error_message: str, actual_error: Exception = None):
        # pass the message to Exception so str(e) and logging show it,
        # instead of an empty string (the original skipped super().__init__)
        super().__init__(error_message)
        self.error_message = error_message
        self.actual_error = actual_error


class NormalizationFailedError(Exception):
    """Raised when normalization of a specific source fails.

    :param error_message: human-readable description of the failure
    :param actual_error: the underlying exception that triggered this, if any
    """
    def __init__(self, error_message: str, actual_error: Exception = None):
        # pass the message to Exception so str(e) and logging show it,
        # instead of an empty string (the original skipped super().__init__)
        super().__init__(error_message)
        self.error_message = error_message
        self.actual_error = actual_error


EDGE_PROPERTIES_THAT_SHOULD_BE_SETS = {AGGREGATOR_KNOWLEDGE_SOURCES, PUBLICATIONS}
NODE_NORMALIZATION_BATCH_SIZE = 1_000_000
EDGE_NORMALIZATION_BATCH_SIZE = 1_000_000
Expand Down Expand Up @@ -350,6 +338,7 @@ def normalize_edge_file(self):
# this could happen due to rare cases of normalization splits where one node normalizes to many
if edge_count > 1:
edge_splits += edge_count - 1

graph_merger.merge_edges(normalized_edges)
self.logger.info(f'Processed {number_of_source_edges} edges so far...')

Expand Down
32 changes: 4 additions & 28 deletions Common/kgxmodel.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from dataclasses import dataclass
from Common.biolink_constants import NAMED_THING
from Common.normalization import NORMALIZATION_CODE_VERSION
from Common.metadata import GraphMetadata
from Common.normalization import NormalizationScheme

class kgxnode:
def __init__(self,
Expand Down Expand Up @@ -33,31 +34,6 @@ def __init__(self,
self.properties = {}


@dataclass
class NormalizationScheme:
    """Settings that select which node/edge normalization versions to use
    and how strictly to apply them.

    Attributes:
        node_normalization_version: node normalizer version ('latest' by default)
        edge_normalization_version: edge normalizer version ('latest' by default)
        normalization_code_version: version of the local normalization code
        strict: whether normalization failures should be treated as errors
        conflation: whether to conflate equivalent identifiers
    """
    node_normalization_version: str = 'latest'
    edge_normalization_version: str = 'latest'
    normalization_code_version: str = NORMALIZATION_CODE_VERSION
    strict: bool = True
    conflation: bool = False

    def get_composite_normalization_version(self):
        """Build a single underscore-joined version string from all settings."""
        version_parts = [self.node_normalization_version,
                         self.edge_normalization_version,
                         self.normalization_code_version]
        if self.conflation:
            version_parts.append('conflated')
        if self.strict:
            version_parts.append('strict')
        return '_'.join(version_parts)

    def get_metadata_representation(self):
        """Return a plain dict of the settings, for writing to metadata files."""
        return {'node_normalization_version': self.node_normalization_version,
                'edge_normalization_version': self.edge_normalization_version,
                'normalization_code_version': self.normalization_code_version,
                'conflation': self.conflation,
                'strict': self.strict}


@dataclass
class GraphSpec:
graph_id: str
Expand Down Expand Up @@ -91,13 +67,13 @@ class GraphSource:

@dataclass
class SubGraphSource(GraphSource):
    """A graph source that is itself a previously built graph (a subgraph
    dependency), rather than a single parsed data source.
    """
    # GraphMetadata object for the subgraph build; None until it has been loaded
    graph_metadata: GraphMetadata = None

    def get_metadata_representation(self):
        """Return a dict describing this subgraph source for metadata files."""
        # NOTE(review): the trailing ':' inside the 'merge_strategy:' key looks
        # accidental, but previously written metadata may depend on it — confirm
        # before changing.
        return {'graph_id': self.id,
                'release_version': self.version,
                'merge_strategy:': self.merge_strategy,
                # guard against graph_metadata still being None
                'graph_metadata': self.graph_metadata.metadata if self.graph_metadata else None}


@dataclass
Expand Down
16 changes: 2 additions & 14 deletions Common/load_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,8 @@

from Common.data_sources import SourceDataLoaderClassFactory, RESOURCE_HOGS, get_available_data_sources
from Common.utils import LoggingUtil, GetDataPullError
from Common.kgx_file_normalizer import KGXFileNormalizer, NormalizationBrokenError, NormalizationFailedError
from Common.kgxmodel import NormalizationScheme
from Common.normalization import NodeNormalizer, EdgeNormalizer
from Common.kgx_file_normalizer import KGXFileNormalizer
from Common.normalization import NormalizationScheme, NodeNormalizer, EdgeNormalizer, NormalizationFailedError
from Common.metadata import SourceMetadata
from Common.loader_interface import SourceDataBrokenError, SourceDataFailedError
from Common.supplementation import SequenceVariantSupplementation, SupplementationFailedError
Expand Down Expand Up @@ -356,17 +355,6 @@ def normalize_source(self,
normalization_status=SourceMetadata.STABLE,
normalization_info=normalization_info)
return True
except NormalizationBrokenError as broken_error:
error_message = f"{source_id} NormalizationBrokenError: {broken_error.error_message}"
if broken_error.actual_error:
error_message += f" - {broken_error.actual_error}"
self.logger.error(error_message)
source_metadata.update_normalization_metadata(parsing_version,
composite_normalization_version,
normalization_status=SourceMetadata.BROKEN,
normalization_error=error_message,
normalization_time=current_time)
return False
except NormalizationFailedError as failed_error:
error_message = f"{source_id} NormalizationFailedError: {failed_error.error_message}"
if failed_error.actual_error:
Expand Down
37 changes: 27 additions & 10 deletions Common/merging.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,17 +19,34 @@ def edge_key_function(edge):


def entity_merging_function(entity_1, entity_2, properties_that_are_sets):
    """Merge the properties of entity_2 into entity_1 and return entity_1.

    Rules, per property of entity_2:
    - property missing from entity_1: copy entity_2's value over
    - entity_2's value is null/empty and entity_1 has the property: keep
      entity_1's value (ignore the empty one)
    - both values are lists: concatenate
    - one value is a list: fold the scalar into the list
    - neither is a list: keep entity_1's value
    Properties named in properties_that_are_sets are de-duplicated after a
    list merge.

    :param entity_1: dict the merge happens into (mutated and returned)
    :param entity_2: dict whose properties are merged in (not mutated)
    :param properties_that_are_sets: property keys whose merged lists should
                                     contain no duplicates
    :return: entity_1, after merging
    """
    for key, entity_2_value in entity_2.items():
        if key not in entity_1:
            # entity 1 doesn't have the property, take it from entity 2 as-is
            entity_1[key] = entity_2_value
            continue
        if not entity_2_value:
            # entity 1 already has the property and entity 2's value is
            # null/empty: keep entity 1's value. (Previously this case fell
            # into the catch-all branch and clobbered entity 1's value with
            # the empty one, contradicting the documented intent.)
            continue
        entity_1_value = entity_1[key]
        entity_1_is_list = isinstance(entity_1_value, list)
        entity_2_is_list = isinstance(entity_2_value, list)
        if entity_1_is_list and entity_2_is_list:
            # both are lists, just combine them
            entity_1_value.extend(entity_2_value)
        elif entity_1_is_list:
            # 1 is a list and 2 isn't, append the value of 2 to the list from 1
            entity_1_value.append(entity_2_value)
        elif entity_2_is_list:
            if entity_1_value:
                # 2 is a list and 1 has a value, prepend 1's value to 2's list
                entity_1[key] = [entity_1_value] + entity_2_value
            else:
                # 2 is a list and 1 has no real value, just use the list from 2
                entity_1[key] = entity_2_value
        # else: neither is a list, keep the value from entity 1
        if (entity_1_is_list or entity_2_is_list) and (key in properties_that_are_sets):
            # de-duplicate set-like properties after any list merge
            entity_1[key] = list(set(entity_1[key]))
    return entity_1


Expand Down
5 changes: 4 additions & 1 deletion Common/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import json
from xxhash import xxh64_hexdigest

from Common.kgxmodel import NormalizationScheme
from Common.normalization import NormalizationScheme


class Metadata:
Expand Down Expand Up @@ -122,6 +122,9 @@ def get_build_status(self):
def get_graph_version(self):
    """Return the graph version string stored in this metadata."""
    graph_version = self.metadata['graph_version']
    return graph_version

def get_source_ids(self):
    """Return the source_id of every source recorded in this graph's metadata."""
    source_ids = []
    for source in self.metadata['sources']:
        source_ids.append(source['source_id'])
    return source_ids


class SourceMetadata(Metadata):

Expand Down
Loading

0 comments on commit b124dfb

Please sign in to comment.