Skip to content

Commit

Permalink
Enhancements to improve VRS computation performance (#310)
Browse files Browse the repository at this point in the history
* Remove unecessary copy of input allele

* Allow enref() caller to get back id and object as tuple

* Don't generate an id if the object already has one

* Added a context manager that controls how ga4gh_identify decides whether or not to compute an identifier

* Decorated annotate() method to eliminate unecessary identifier computation

* Added unit tests to verify new behavior of ga4gh_identify()
  • Loading branch information
ehclark committed Jan 4, 2024
1 parent 1a25095 commit 50af18d
Show file tree
Hide file tree
Showing 7 changed files with 175 additions and 18 deletions.
2 changes: 1 addition & 1 deletion src/ga4gh/core/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from ._internal.exceptions import GA4GHError
from ._internal.identifiers import (
ga4gh_digest, ga4gh_identify, ga4gh_serialize, is_ga4gh_identifier,
parse_ga4gh_identifier
parse_ga4gh_identifier, GA4GHComputeIdentifierWhen, use_ga4gh_compute_identifier_when
)
from ._internal.pydantic import (
is_pydantic_instance, is_curie_type, is_identifiable, is_literal, pydantic_copy
Expand Down
6 changes: 3 additions & 3 deletions src/ga4gh/core/_internal/enderef.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
_logger = logging.getLogger(__name__)


def ga4gh_enref(o, cra_map, object_store=None):
def ga4gh_enref(o, cra_map, object_store=None, return_id_obj_tuple=False):
"""Recursively convert "referable attributes" from inlined to
referenced form. Returns a new object.
Expand Down Expand Up @@ -65,8 +65,8 @@ def _enref(o):

# in-place replacement on object copy
o = pydantic_copy(o)
_enref(o)
return o
_id = _enref(o)
return (_id, o) if return_id_obj_tuple else o


def ga4gh_deref(o, cra_map, object_store):
Expand Down
71 changes: 68 additions & 3 deletions src/ga4gh/core/_internal/identifiers.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,11 @@
"""

import contextvars
import logging
import re
from contextlib import ContextDecorator
from enum import IntEnum
from typing import Union, Optional
from pydantic import BaseModel, RootModel
from canonicaljson import encode_canonical_json
Expand Down Expand Up @@ -44,6 +47,53 @@
ns_w_sep = namespace + curie_sep


class GA4GHComputeIdentifierWhen(IntEnum):
"""
Defines the rule for when the `ga4gh_identify` method should compute
an identifier ('id' attribute) for the specified object. The options are:
ALWAYS - Always compute the identifier (this is the default behavior)
INVALID - Compute the identifier if it is missing or is present but syntactically invalid
MISSING - Only compute the identifier if missing
The default behavior is safe and ensures that the identifiers are correct,
but at a performance cost. Where the source of inputs to `ga4gh_identify`
are well controlled, for example when annotating a VCF file with VRS IDs,
using `MISSING` can improve performance.
"""

ALWAYS = 0
INVALID = 1
MISSING = 2


ga4gh_compute_identifier_when = contextvars.ContextVar("ga4gh_compute_identifier_when")


class use_ga4gh_compute_identifier_when(ContextDecorator):
"""
Context manager that defines when to compute identifiers
for all operations within the context. For example:
with use_ga4gh_compute_identifier_when(GA4GHComputeIdentifierWhen.INVALID):
VCFAnnotator(...).annotate(...)
Or:
@use_ga4gh_compute_identifier_when(GA4GHComputeIdentifierWhen.INVALID)
def my_method():
"""

def __init__(self, when: GA4GHComputeIdentifierWhen):
self.when = when
self.token = None

def __enter__(self):
self.token = ga4gh_compute_identifier_when.set(self.when)

def __exit__(self, exc_type, exc, exc_tb):
ga4gh_compute_identifier_when.reset(self.token)


def is_ga4gh_identifier(ir):
"""
Expand Down Expand Up @@ -94,10 +144,25 @@ def ga4gh_identify(vro):
"""
if is_identifiable(vro):
digest = ga4gh_digest(vro)
pfx = vro.ga4gh.prefix
ir = f"{namespace}{curie_sep}{pfx}{ref_sep}{digest}"
when_rule = ga4gh_compute_identifier_when.get(GA4GHComputeIdentifierWhen.ALWAYS)
do_compute = False
ir = None
if when_rule == GA4GHComputeIdentifierWhen.ALWAYS:
do_compute = True
else:
ir = getattr(vro, "id", None)
if when_rule == GA4GHComputeIdentifierWhen.MISSING:
do_compute = ir is None or ir == ""
else: # INVALID
do_compute = ir is None or ir == "" or ga4gh_ir_regexp.match(ir) is None

if do_compute:
digest = ga4gh_digest(vro)
pfx = vro.ga4gh.prefix
ir = f"{namespace}{curie_sep}{pfx}{ref_sep}{digest}"

return ir

return None


Expand Down
4 changes: 2 additions & 2 deletions src/ga4gh/vrs/_internal/enderef.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
from .models import class_refatt_map


def vrs_enref(o, object_store=None):
return ga4gh_enref(o, cra_map=class_refatt_map, object_store=object_store)
def vrs_enref(o, object_store=None, return_id_obj_tuple=False):
return ga4gh_enref(o, cra_map=class_refatt_map, object_store=object_store, return_id_obj_tuple=return_id_obj_tuple)


def vrs_deref(o, object_store):
Expand Down
2 changes: 2 additions & 0 deletions src/ga4gh/vrs/extras/vcf_annotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from biocommons.seqrepo import SeqRepo
from pydantic import ValidationError

from ga4gh.core import GA4GHComputeIdentifierWhen, use_ga4gh_compute_identifier_when
from ga4gh.vrs.dataproxy import SeqRepoDataProxy, SeqRepoRESTDataProxy
from ga4gh.vrs.extras.translator import AlleleTranslator, ValidationError as TranslatorValidationError

Expand Down Expand Up @@ -161,6 +162,7 @@ def __init__(self, seqrepo_dp_type: SeqRepoProxyType = SeqRepoProxyType.LOCAL,
self.dp = SeqRepoRESTDataProxy(seqrepo_base_url)
self.tlr = AlleleTranslator(self.dp)

@use_ga4gh_compute_identifier_when(GA4GHComputeIdentifierWhen.MISSING)
def annotate( # pylint: disable=too-many-arguments,too-many-locals
self, vcf_in: str, vcf_out: Optional[str] = None,
vrs_pickle_out: Optional[str] = None, vrs_attributes: bool = False,
Expand Down
15 changes: 7 additions & 8 deletions src/ga4gh/vrs/normalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,10 +102,9 @@ def _normalize_allele(input_allele, data_proxy, rle_seq_limit=50):
Does not attempt to normalize Alleles with definite ranges and will instead return the
`input_allele`
"""
allele = pydantic_copy(input_allele)

if isinstance(allele.location.sequenceReference, models.SequenceReference):
alias = f"ga4gh:{allele.location.sequenceReference.refgetAccession}"
if isinstance(input_allele.location.sequenceReference, models.SequenceReference):
alias = f"ga4gh:{input_allele.location.sequenceReference.refgetAccession}"
else:
_logger.warning(
"`input_allele.location.sequenceReference` expects a `SequenceReference`, "
Expand All @@ -115,17 +114,17 @@ def _normalize_allele(input_allele, data_proxy, rle_seq_limit=50):

# Get reference sequence and interval
ref_seq = SequenceProxy(data_proxy, alias)
start = _get_allele_location_pos(allele, use_start=True)
start = _get_allele_location_pos(input_allele, use_start=True)
if start is None:
return input_allele

end = _get_allele_location_pos(allele, use_start=False)
end = _get_allele_location_pos(input_allele, use_start=False)
if end is None:
return input_allele

ival = (start.value, end.value)
if allele.state.sequence:
alleles = (None, allele.state.sequence.root)
if input_allele.state.sequence:
alleles = (None, input_allele.state.sequence.root)
else:
alleles = (None, "")

Expand All @@ -145,7 +144,7 @@ def _normalize_allele(input_allele, data_proxy, rle_seq_limit=50):
if not len_ref_seq and not len_alt_seq:
return input_allele

new_allele = pydantic_copy(allele)
new_allele = pydantic_copy(input_allele)

if len_ref_seq and len_alt_seq:
new_allele.location.start = _get_new_allele_location_pos(
Expand Down
93 changes: 92 additions & 1 deletion tests/test_vrs2.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,10 @@
ga4gh_identify,
is_pydantic_instance,
is_curie_type,
pydantic_copy)
pydantic_copy,
use_ga4gh_compute_identifier_when,
GA4GHComputeIdentifierWhen,
)
from ga4gh.vrs import models, vrs_enref, vrs_deref

allele_dict = {
Expand Down Expand Up @@ -245,3 +248,91 @@ def test_class_refatt_map():
'GenotypeMember': ['variation']
}
assert class_refatt_map_expected == models.class_refatt_map


def test_compute_identifiers_when():
a = {
"type": "Allele",
"location": {
"type": "SequenceLocation",
"sequenceReference": {
"type": "SequenceReference",
"refgetAccession": "SQ.jdEWLvLvT8827O59m1Agh5H3n6kTzBsJ",
},
"start": 44908821,
"end": 44908822,
},
"state": {"type": "LiteralSequenceExpression", "sequence": "T"},
}
correct_id = "ga4gh:VA.PkeY9RbMt9CEFakQ0AgDdAQ7POUeoWR0"
syntax_valid_id = "ga4gh:VA.39eae078d9bb30da2a5c5d1969cb1472"
syntax_invalid_id = "ga4gh:12345"

# when id property is missing
vo_a = models.Allele(**a)
assert ga4gh_identify(vo_a) == correct_id
with use_ga4gh_compute_identifier_when(GA4GHComputeIdentifierWhen.ALWAYS):
assert ga4gh_identify(vo_a) == correct_id
with use_ga4gh_compute_identifier_when(GA4GHComputeIdentifierWhen.INVALID):
assert ga4gh_identify(vo_a) == correct_id
with use_ga4gh_compute_identifier_when(GA4GHComputeIdentifierWhen.MISSING):
assert ga4gh_identify(vo_a) == correct_id

# when id property is none
a["id"] = None
vo_a = models.Allele(**a)
assert ga4gh_identify(vo_a) == correct_id
with use_ga4gh_compute_identifier_when(GA4GHComputeIdentifierWhen.ALWAYS):
assert ga4gh_identify(vo_a) == correct_id
with use_ga4gh_compute_identifier_when(GA4GHComputeIdentifierWhen.INVALID):
assert ga4gh_identify(vo_a) == correct_id
with use_ga4gh_compute_identifier_when(GA4GHComputeIdentifierWhen.MISSING):
assert ga4gh_identify(vo_a) == correct_id

# when id property is blank
a["id"] = ""
vo_a = models.Allele(**a)
assert ga4gh_identify(vo_a) == correct_id
with use_ga4gh_compute_identifier_when(GA4GHComputeIdentifierWhen.ALWAYS):
assert ga4gh_identify(vo_a) == correct_id
with use_ga4gh_compute_identifier_when(GA4GHComputeIdentifierWhen.INVALID):
assert ga4gh_identify(vo_a) == correct_id
with use_ga4gh_compute_identifier_when(GA4GHComputeIdentifierWhen.MISSING):
assert ga4gh_identify(vo_a) == correct_id

# when id property is syntactically invalid
a["id"] = syntax_invalid_id
vo_a = models.Allele(**a)
assert ga4gh_identify(vo_a) == correct_id
with use_ga4gh_compute_identifier_when(GA4GHComputeIdentifierWhen.ALWAYS):
assert ga4gh_identify(vo_a) == correct_id
with use_ga4gh_compute_identifier_when(GA4GHComputeIdentifierWhen.INVALID):
assert ga4gh_identify(vo_a) == correct_id
with use_ga4gh_compute_identifier_when(GA4GHComputeIdentifierWhen.MISSING):
assert ga4gh_identify(vo_a) == syntax_invalid_id

# when id property is syntactically valid
a["id"] = syntax_valid_id
vo_a = models.Allele(**a)
assert ga4gh_identify(vo_a) == correct_id
with use_ga4gh_compute_identifier_when(GA4GHComputeIdentifierWhen.ALWAYS):
assert ga4gh_identify(vo_a) == correct_id
with use_ga4gh_compute_identifier_when(GA4GHComputeIdentifierWhen.INVALID):
assert ga4gh_identify(vo_a) == syntax_valid_id
with use_ga4gh_compute_identifier_when(GA4GHComputeIdentifierWhen.MISSING):
assert ga4gh_identify(vo_a) == syntax_valid_id

# when id property is correct
a["id"] = correct_id
vo_a = models.Allele(**a)
assert ga4gh_identify(vo_a) == correct_id
assert ga4gh_identify(vo_a) is not correct_id
with use_ga4gh_compute_identifier_when(GA4GHComputeIdentifierWhen.ALWAYS):
assert ga4gh_identify(vo_a) == correct_id
assert ga4gh_identify(vo_a) is not correct_id
with use_ga4gh_compute_identifier_when(GA4GHComputeIdentifierWhen.INVALID):
assert ga4gh_identify(vo_a) == correct_id
assert ga4gh_identify(vo_a) is correct_id
with use_ga4gh_compute_identifier_when(GA4GHComputeIdentifierWhen.MISSING):
assert ga4gh_identify(vo_a) == correct_id
assert ga4gh_identify(vo_a) is correct_id

0 comments on commit 50af18d

Please sign in to comment.