From 50af18d731c2d8489ef4251ad9cc3eecab6718a1 Mon Sep 17 00:00:00 2001 From: Eugene Clark Date: Thu, 4 Jan 2024 16:04:55 -0500 Subject: [PATCH] Enhancements to improve VRS computation performance (#310) * Remove unecessary copy of input allele * Allow enref() caller to get back id and object as tuple * Don't generate an id if the object already has one * Added a context manager that controls how ga4gh_identify decides whether or not to compute an identifier * Decorated annotate() method to eliminate unecessary identifier computation * Added unit tests to verify new behavior of ga4gh_identify() --- src/ga4gh/core/__init__.py | 2 +- src/ga4gh/core/_internal/enderef.py | 6 +- src/ga4gh/core/_internal/identifiers.py | 71 ++++++++++++++++++- src/ga4gh/vrs/_internal/enderef.py | 4 +- src/ga4gh/vrs/extras/vcf_annotation.py | 2 + src/ga4gh/vrs/normalize.py | 15 ++-- tests/test_vrs2.py | 93 ++++++++++++++++++++++++- 7 files changed, 175 insertions(+), 18 deletions(-) diff --git a/src/ga4gh/core/__init__.py b/src/ga4gh/core/__init__.py index d78d22e0..c2de1300 100644 --- a/src/ga4gh/core/__init__.py +++ b/src/ga4gh/core/__init__.py @@ -10,7 +10,7 @@ from ._internal.exceptions import GA4GHError from ._internal.identifiers import ( ga4gh_digest, ga4gh_identify, ga4gh_serialize, is_ga4gh_identifier, - parse_ga4gh_identifier + parse_ga4gh_identifier, GA4GHComputeIdentifierWhen, use_ga4gh_compute_identifier_when ) from ._internal.pydantic import ( is_pydantic_instance, is_curie_type, is_identifiable, is_literal, pydantic_copy diff --git a/src/ga4gh/core/_internal/enderef.py b/src/ga4gh/core/_internal/enderef.py index 16198a39..1f4c4001 100644 --- a/src/ga4gh/core/_internal/enderef.py +++ b/src/ga4gh/core/_internal/enderef.py @@ -22,7 +22,7 @@ _logger = logging.getLogger(__name__) -def ga4gh_enref(o, cra_map, object_store=None): +def ga4gh_enref(o, cra_map, object_store=None, return_id_obj_tuple=False): """Recursively convert "referable attributes" from inlined to referenced form. Returns a new object. @@ -65,8 +65,8 @@ def _enref(o): # in-place replacement on object copy o = pydantic_copy(o) - _enref(o) - return o + _id = _enref(o) + return (_id, o) if return_id_obj_tuple else o def ga4gh_deref(o, cra_map, object_store): diff --git a/src/ga4gh/core/_internal/identifiers.py b/src/ga4gh/core/_internal/identifiers.py index 3df16352..bdf1b0b3 100644 --- a/src/ga4gh/core/_internal/identifiers.py +++ b/src/ga4gh/core/_internal/identifiers.py @@ -15,8 +15,11 @@ """ +import contextvars import logging import re +from contextlib import ContextDecorator +from enum import IntEnum from typing import Union, Optional from pydantic import BaseModel, RootModel from canonicaljson import encode_canonical_json @@ -44,6 +47,53 @@ ns_w_sep = namespace + curie_sep +class GA4GHComputeIdentifierWhen(IntEnum): + """ + Defines the rule for when the `ga4gh_identify` method should compute + an identifier ('id' attribute) for the specified object. The options are: + ALWAYS - Always compute the identifier (this is the default behavior) + INVALID - Compute the identifier if it is missing or is present but syntactically invalid + MISSING - Only compute the identifier if missing + + The default behavior is safe and ensures that the identifiers are correct, + but at a performance cost. Where the source of inputs to `ga4gh_identify` + are well controlled, for example when annotating a VCF file with VRS IDs, + using `MISSING` can improve performance. + """ + + ALWAYS = 0 + INVALID = 1 + MISSING = 2 + + +ga4gh_compute_identifier_when = contextvars.ContextVar("ga4gh_compute_identifier_when") + + +class use_ga4gh_compute_identifier_when(ContextDecorator): + """ + Context manager that defines when to compute identifiers + for all operations within the context. For example: + + with use_ga4gh_compute_identifier_when(GA4GHComputeIdentifierWhen.INVALID): + VCFAnnotator(...).annotate(...) + + Or: + + @use_ga4gh_compute_identifier_when(GA4GHComputeIdentifierWhen.INVALID) + def my_method(): + """ + + def __init__(self, when: GA4GHComputeIdentifierWhen): + self.when = when + self.token = None + + def __enter__(self): + self.token = ga4gh_compute_identifier_when.set(self.when) + + def __exit__(self, exc_type, exc, exc_tb): + ga4gh_compute_identifier_when.reset(self.token) + + def is_ga4gh_identifier(ir): """ @@ -94,10 +144,25 @@ def ga4gh_identify(vro): """ if is_identifiable(vro): - digest = ga4gh_digest(vro) - pfx = vro.ga4gh.prefix - ir = f"{namespace}{curie_sep}{pfx}{ref_sep}{digest}" + when_rule = ga4gh_compute_identifier_when.get(GA4GHComputeIdentifierWhen.ALWAYS) + do_compute = False + ir = None + if when_rule == GA4GHComputeIdentifierWhen.ALWAYS: + do_compute = True + else: + ir = getattr(vro, "id", None) + if when_rule == GA4GHComputeIdentifierWhen.MISSING: + do_compute = ir is None or ir == "" + else: # INVALID + do_compute = ir is None or ir == "" or ga4gh_ir_regexp.match(ir) is None + + if do_compute: + digest = ga4gh_digest(vro) + pfx = vro.ga4gh.prefix + ir = f"{namespace}{curie_sep}{pfx}{ref_sep}{digest}" + return ir + return None diff --git a/src/ga4gh/vrs/_internal/enderef.py b/src/ga4gh/vrs/_internal/enderef.py index 522d83de..50b1ac8a 100644 --- a/src/ga4gh/vrs/_internal/enderef.py +++ b/src/ga4gh/vrs/_internal/enderef.py @@ -3,8 +3,8 @@ from .models import class_refatt_map -def vrs_enref(o, object_store=None): - return ga4gh_enref(o, cra_map=class_refatt_map, object_store=object_store) +def vrs_enref(o, object_store=None, return_id_obj_tuple=False): + return ga4gh_enref(o, cra_map=class_refatt_map, object_store=object_store, return_id_obj_tuple=return_id_obj_tuple) def vrs_deref(o, object_store): diff --git a/src/ga4gh/vrs/extras/vcf_annotation.py b/src/ga4gh/vrs/extras/vcf_annotation.py index 144f9c4d..f2a6610f 100644 --- a/src/ga4gh/vrs/extras/vcf_annotation.py +++ b/src/ga4gh/vrs/extras/vcf_annotation.py @@ -15,6 +15,7 @@ from biocommons.seqrepo import SeqRepo from pydantic import ValidationError +from ga4gh.core import GA4GHComputeIdentifierWhen, use_ga4gh_compute_identifier_when from ga4gh.vrs.dataproxy import SeqRepoDataProxy, SeqRepoRESTDataProxy from ga4gh.vrs.extras.translator import AlleleTranslator, ValidationError as TranslatorValidationError @@ -161,6 +162,7 @@ def __init__(self, seqrepo_dp_type: SeqRepoProxyType = SeqRepoProxyType.LOCAL, self.dp = SeqRepoRESTDataProxy(seqrepo_base_url) self.tlr = AlleleTranslator(self.dp) + @use_ga4gh_compute_identifier_when(GA4GHComputeIdentifierWhen.MISSING) def annotate( # pylint: disable=too-many-arguments,too-many-locals self, vcf_in: str, vcf_out: Optional[str] = None, vrs_pickle_out: Optional[str] = None, vrs_attributes: bool = False, diff --git a/src/ga4gh/vrs/normalize.py b/src/ga4gh/vrs/normalize.py index 4e544f29..a92a6723 100644 --- a/src/ga4gh/vrs/normalize.py +++ b/src/ga4gh/vrs/normalize.py @@ -102,10 +102,9 @@ def _normalize_allele(input_allele, data_proxy, rle_seq_limit=50): Does not attempt to normalize Alleles with definite ranges and will instead return the `input_allele` """ - allele = pydantic_copy(input_allele) - if isinstance(allele.location.sequenceReference, models.SequenceReference): - alias = f"ga4gh:{allele.location.sequenceReference.refgetAccession}" + if isinstance(input_allele.location.sequenceReference, models.SequenceReference): + alias = f"ga4gh:{input_allele.location.sequenceReference.refgetAccession}" else: _logger.warning( "`input_allele.location.sequenceReference` expects a `SequenceReference`, " @@ -115,17 +114,17 @@ def _normalize_allele(input_allele, data_proxy, rle_seq_limit=50): # Get reference sequence and interval ref_seq = SequenceProxy(data_proxy, alias) - start = _get_allele_location_pos(allele, use_start=True) + start = _get_allele_location_pos(input_allele, use_start=True) if start is None: return input_allele - end = _get_allele_location_pos(allele, use_start=False) + end = _get_allele_location_pos(input_allele, use_start=False) if end is None: return input_allele ival = (start.value, end.value) - if allele.state.sequence: - alleles = (None, allele.state.sequence.root) + if input_allele.state.sequence: + alleles = (None, input_allele.state.sequence.root) else: alleles = (None, "") @@ -145,7 +144,7 @@ def _normalize_allele(input_allele, data_proxy, rle_seq_limit=50): if not len_ref_seq and not len_alt_seq: return input_allele - new_allele = pydantic_copy(allele) + new_allele = pydantic_copy(input_allele) if len_ref_seq and len_alt_seq: new_allele.location.start = _get_new_allele_location_pos( diff --git a/tests/test_vrs2.py b/tests/test_vrs2.py index 0e67ef4d..3de0ad8a 100644 --- a/tests/test_vrs2.py +++ b/tests/test_vrs2.py @@ -5,7 +5,10 @@ ga4gh_identify, is_pydantic_instance, is_curie_type, - pydantic_copy) + pydantic_copy, + use_ga4gh_compute_identifier_when, + GA4GHComputeIdentifierWhen, +) from ga4gh.vrs import models, vrs_enref, vrs_deref allele_dict = { @@ -245,3 +248,91 @@ def test_class_refatt_map(): 'GenotypeMember': ['variation'] } assert class_refatt_map_expected == models.class_refatt_map + + +def test_compute_identifiers_when(): + a = { + "type": "Allele", + "location": { + "type": "SequenceLocation", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.jdEWLvLvT8827O59m1Agh5H3n6kTzBsJ", + }, + "start": 44908821, + "end": 44908822, + }, + "state": {"type": "LiteralSequenceExpression", "sequence": "T"}, + } + correct_id = "ga4gh:VA.PkeY9RbMt9CEFakQ0AgDdAQ7POUeoWR0" + syntax_valid_id = "ga4gh:VA.39eae078d9bb30da2a5c5d1969cb1472" + syntax_invalid_id = "ga4gh:12345" + + # when id property is missing + vo_a = models.Allele(**a) + assert ga4gh_identify(vo_a) == correct_id + with use_ga4gh_compute_identifier_when(GA4GHComputeIdentifierWhen.ALWAYS): + assert ga4gh_identify(vo_a) == correct_id + with use_ga4gh_compute_identifier_when(GA4GHComputeIdentifierWhen.INVALID): + assert ga4gh_identify(vo_a) == correct_id + with use_ga4gh_compute_identifier_when(GA4GHComputeIdentifierWhen.MISSING): + assert ga4gh_identify(vo_a) == correct_id + + # when id property is none + a["id"] = None + vo_a = models.Allele(**a) + assert ga4gh_identify(vo_a) == correct_id + with use_ga4gh_compute_identifier_when(GA4GHComputeIdentifierWhen.ALWAYS): + assert ga4gh_identify(vo_a) == correct_id + with use_ga4gh_compute_identifier_when(GA4GHComputeIdentifierWhen.INVALID): + assert ga4gh_identify(vo_a) == correct_id + with use_ga4gh_compute_identifier_when(GA4GHComputeIdentifierWhen.MISSING): + assert ga4gh_identify(vo_a) == correct_id + + # when id property is blank + a["id"] = "" + vo_a = models.Allele(**a) + assert ga4gh_identify(vo_a) == correct_id + with use_ga4gh_compute_identifier_when(GA4GHComputeIdentifierWhen.ALWAYS): + assert ga4gh_identify(vo_a) == correct_id + with use_ga4gh_compute_identifier_when(GA4GHComputeIdentifierWhen.INVALID): + assert ga4gh_identify(vo_a) == correct_id + with use_ga4gh_compute_identifier_when(GA4GHComputeIdentifierWhen.MISSING): + assert ga4gh_identify(vo_a) == correct_id + + # when id property is syntactically invalid + a["id"] = syntax_invalid_id + vo_a = models.Allele(**a) + assert ga4gh_identify(vo_a) == correct_id + with use_ga4gh_compute_identifier_when(GA4GHComputeIdentifierWhen.ALWAYS): + assert ga4gh_identify(vo_a) == correct_id + with use_ga4gh_compute_identifier_when(GA4GHComputeIdentifierWhen.INVALID): + assert ga4gh_identify(vo_a) == correct_id + with use_ga4gh_compute_identifier_when(GA4GHComputeIdentifierWhen.MISSING): + assert ga4gh_identify(vo_a) == syntax_invalid_id + + # when id property is syntactically valid + a["id"] = syntax_valid_id + vo_a = models.Allele(**a) + assert ga4gh_identify(vo_a) == correct_id + with use_ga4gh_compute_identifier_when(GA4GHComputeIdentifierWhen.ALWAYS): + assert ga4gh_identify(vo_a) == correct_id + with use_ga4gh_compute_identifier_when(GA4GHComputeIdentifierWhen.INVALID): + assert ga4gh_identify(vo_a) == syntax_valid_id + with use_ga4gh_compute_identifier_when(GA4GHComputeIdentifierWhen.MISSING): + assert ga4gh_identify(vo_a) == syntax_valid_id + + # when id property is correct + a["id"] = correct_id + vo_a = models.Allele(**a) + assert ga4gh_identify(vo_a) == correct_id + assert ga4gh_identify(vo_a) is not correct_id + with use_ga4gh_compute_identifier_when(GA4GHComputeIdentifierWhen.ALWAYS): + assert ga4gh_identify(vo_a) == correct_id + assert ga4gh_identify(vo_a) is not correct_id + with use_ga4gh_compute_identifier_when(GA4GHComputeIdentifierWhen.INVALID): + assert ga4gh_identify(vo_a) == correct_id + assert ga4gh_identify(vo_a) is correct_id + with use_ga4gh_compute_identifier_when(GA4GHComputeIdentifierWhen.MISSING): + assert ga4gh_identify(vo_a) == correct_id + assert ga4gh_identify(vo_a) is correct_id