Skip to content

Commit

Permalink
feat: VRS 1.3 digests may be computed for Alleles and Sequence Locati…
Browse files Browse the repository at this point in the history
…ons (#427)

closes #382

---------

Co-authored-by: Kori Kuzma <korikuzma@gmail.com>
  • Loading branch information
ahwagner and korikuzma committed Jul 16, 2024
1 parent 448538c commit 3347873
Show file tree
Hide file tree
Showing 4 changed files with 112 additions and 20 deletions.
33 changes: 24 additions & 9 deletions src/ga4gh/core/identifiers.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ def parse_ga4gh_identifier(ir):
raise ValueError(ir) from e


def ga4gh_identify(vro, in_place='default'):
def ga4gh_identify(vro, in_place='default', as_version=None):
"""
Return the GA4GH digest-based id for the object, as a CURIE
(string). Returns None if object is not identifiable.
Expand All @@ -137,6 +137,10 @@ def ga4gh_identify(vro, in_place='default'):
- 'never': the vro.id field will not be edited in-place,
even when empty
If 'as_version' is set to a version string, other parameters are
ignored and an identifier returned following the conventions of
the VRS version indicated by 'as_version'.
TODO update example for VRS 2.0
>>> import ga4gh.vrs
>>> ival = ga4gh.vrs.models.SimpleInterval(start=44908821, end=44908822)
Expand All @@ -158,18 +162,20 @@ def ga4gh_identify(vro, in_place='default'):
do_compute = not vro.has_valid_ga4gh_id()

if do_compute:
obj_id = vro.get_or_create_ga4gh_identifier(in_place)
obj_id = vro.get_or_create_ga4gh_identifier(in_place, as_version=as_version)

return obj_id

return None


def ga4gh_digest(vro: BaseModel, overwrite=False):
def ga4gh_digest(vro: BaseModel, overwrite=False, as_version=None):
"""
Return the GA4GH digest for the object.
do_compact: bool - true if object compaction should be performed during serialization
If 'as_version' is set to a version string, other parameters
are ignored and a digest returned following the conventions of
the VRS version indicated by 'as_version'.
TODO update example
Expand All @@ -181,7 +187,10 @@ def ga4gh_digest(vro: BaseModel, overwrite=False):
"""
if vro.is_ga4gh_identifiable(): # Only GA4GH identifiable objects are GA4GH digestible
return vro.get_or_create_digest(overwrite)
if as_version is None:
return vro.get_or_create_digest(overwrite)
else:
return vro.compute_digest(as_version=as_version)
else:
return None

Expand Down Expand Up @@ -210,9 +219,15 @@ def collapse_identifiable_values(obj: dict) -> dict:
return obj


def ga4gh_serialize(obj: BaseModel, as_version=None) -> Optional[bytes]:
    """
    Serialize an object into the form that is hashed for computed digests.

    With 'as_version' left as None, the object's JSON dump is returned
    encoded as UTF-8 bytes.

    With 'as_version' set to a VRS version string, the work is delegated to
    the object's own ga4gh_serialize_as_version() and its result is returned
    unchanged.
    NOTE(review): that result is passed through without encoding, so it is
    whatever type the object's serializer produces (the visible serializers
    build f-strings) - confirm callers expect that rather than bytes.
    """
    if as_version is not None:
        # Version-specific conventions live on the model class itself.
        return obj.ga4gh_serialize_as_version(as_version)
    return obj.model_dump_json().encode("utf-8")
82 changes: 72 additions & 10 deletions src/ga4gh/vrs/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
import inspect
import sys
from ga4gh.core import sha512t24u, GA4GH_PREFIX_SEP, CURIE_SEP, CURIE_NAMESPACE, GA4GH_IR_REGEXP
from ga4gh.core.pydantic import get_pydantic_root

from pydantic import BaseModel, Field, RootModel, StringConstraints, model_serializer

Expand Down Expand Up @@ -229,16 +230,24 @@ def has_valid_ga4gh_id(self):
def has_valid_digest(self):
return bool(self.digest) # Pydantic constraint ensures digest field value is valid

def compute_digest(self, store=True) -> str:
def compute_digest(self, store=True, as_version=None) -> str:
"""A sha512t24u digest created using the VRS Computed Identifier algorithm.
Stores the digest in the object if store is True.
Stores the digest in the object if store is True. If 'as_version' is set to
a version string, other parameters are ignored and a digest returned
following the conventions of the VRS version indicated by 'as_version'.
"""
digest = sha512t24u(self.model_dump_json().encode("utf-8"))
if store:
self.digest = digest
if as_version is None:
digest = sha512t24u(self.model_dump_json().encode("utf-8"))
if store:
self.digest = digest
else:
try:
digest = sha512t24u(self.ga4gh_serialize_as_version(as_version).encode("utf-8"))
except AttributeError:
raise AttributeError('This class does not support prior version identifiers.')
return digest

def get_or_create_ga4gh_identifier(self, in_place='default', recompute=False) -> str:
def get_or_create_ga4gh_identifier(self, in_place='default', recompute=False, as_version=None) -> str:
"""Sets and returns a GA4GH Computed Identifier for the object.
Overwrites the existing identifier if overwrite is True.
Expand All @@ -252,7 +261,14 @@ def get_or_create_ga4gh_identifier(self, in_place='default', recompute=False) ->
even when empty
Digests will be recalculated even if present if recompute is True.
If 'as_version' is set to a version string, other parameters are
ignored and an identifier returned following the conventions of
the VRS version indicated by 'as_version'.
"""
if as_version is not None:
return self.compute_ga4gh_identifier(as_version=as_version)

if in_place == 'default':
if self.id is None:
self.id = self.compute_ga4gh_identifier(recompute)
Expand All @@ -268,10 +284,19 @@ def get_or_create_ga4gh_identifier(self, in_place='default', recompute=False) ->
else:
return self.compute_ga4gh_identifier(recompute)

def compute_ga4gh_identifier(self, recompute=False, as_version=None):
    """Build the GA4GH Computed Identifier (a CURIE string) for this object.

    With 'as_version' left as None, the digest is fetched or created via
    get_or_create_digest(recompute) and combined with the class's current
    ``ga4gh.prefix``.

    With 'as_version' set to a version string, 'recompute' is ignored: the
    digest comes from compute_digest(as_version=...) and the type prefix is
    looked up in ``ga4gh.priorPrefix`` under that version.
    """
    if as_version is not None:
        versioned_digest = self.compute_digest(as_version=as_version)
        versioned_prefix = self.ga4gh.priorPrefix[as_version]
        return f'{CURIE_NAMESPACE}{CURIE_SEP}{versioned_prefix}{GA4GH_PREFIX_SEP}{versioned_digest}'

    # Current-version path: make sure self.digest is populated before use.
    self.get_or_create_digest(recompute)
    return f'{CURIE_NAMESPACE}{CURIE_SEP}{self.ga4gh.prefix}{GA4GH_PREFIX_SEP}{self.digest}'

def get_or_create_digest(self, recompute=False) -> str:
"""Sets and returns a sha512t24u digest of the GA4GH Identifiable Object, or creates
Expand Down Expand Up @@ -431,6 +456,29 @@ class SequenceLocation(_Ga4ghIdentifiableObject):
)
sequence: Optional[SequenceString] = Field(None, description="The literal sequence encoded by the `sequenceReference` at these coordinates.")

def ga4gh_serialize_as_version(self, as_version):
    """Serialize this SequenceLocation following an earlier VRS version's conventions.

    Only version '1.3' is handled. The result is a hand-built JSON string whose
    keys appear in alphabetical order: a "SequenceInterval" holding the
    serialized `end` and `start` coordinates, a "sequence_id" taken from the
    segment of the refget accession after the first '.', and the
    "SequenceLocation" type tag.

    Each coordinate is unwrapped with get_pydantic_root() and rendered as:
      - an int          -> a "Number"
      - [None, upper]   -> an "IndefiniteRange" with comparator "<="
      - [lower, None]   -> an "IndefiniteRange" with comparator ">="
      - [lower, upper]  -> a "DefiniteRange"

    :param as_version: VRS version string selecting the convention to follow.
    :return: the serialized location as a string.
    :raises ValueError: if `as_version` is not '1.3', if no sequence id can be
        derived from `sequenceReference`, or if a coordinate is neither an int
        nor a list.
    """
    if as_version != '1.3':
        raise ValueError(f'Serializing as version {as_version} not supported for this class.')

    # Resolve the sequence id before anything else so that a reference without
    # a usable refget accession is reported clearly, rather than surfacing as a
    # bare AttributeError/IndexError from the final f-string.
    accession = getattr(self.sequenceReference, 'refgetAccession', None)
    accession_parts = accession.split('.') if accession is not None else []
    if len(accession_parts) < 2:
        raise ValueError(
            f'Cannot derive a VRS 1.3 sequence_id from sequenceReference {self.sequenceReference!r}.'
        )
    sequence_id = accession_parts[1]

    def _coordinate_as_1_3(coordinate):
        # Render one interval endpoint in its VRS 1.3 JSON form.
        raw = get_pydantic_root(coordinate)
        if isinstance(raw, int):
            return f'{{"type":"Number","value":{raw}}}'
        if isinstance(raw, list):
            lower, upper = raw[0], raw[1]
            if lower is None:
                return f'{{"comparator":"<=","type":"IndefiniteRange","value":{upper}}}'
            if upper is None:
                return f'{{"comparator":">=","type":"IndefiniteRange","value":{lower}}}'
            return f'{{"max":{upper},"min":{lower},"type":"DefiniteRange"}}'
        raise ValueError(f'{raw} is not int or list.')

    start = _coordinate_as_1_3(self.start)
    end = _coordinate_as_1_3(self.end)
    return (
        f'{{"interval":{{"end":{end},"start":{start},"type":"SequenceInterval"}},'
        f'"sequence_id":"{sequence_id}","type":"SequenceLocation"}}'
    )

def get_refget_accession(self):
if isinstance(self.sequenceReference, SequenceReference):
return self.sequenceReference.refgetAccession
Expand All @@ -441,6 +489,7 @@ def get_refget_accession(self):

class ga4gh(_Ga4ghIdentifiableObject.ga4gh):
prefix = 'SL'
priorPrefix = {'1.3': 'VSL'}
keys = [
'end',
'sequenceReference',
Expand Down Expand Up @@ -474,8 +523,21 @@ class Allele(_VariationBase):
..., description='An expression of the sequence state'
)

def ga4gh_serialize_as_version(self, as_version):
    """Serialize this Allele following an earlier VRS version's conventions.

    Only version '1.3' is handled. The result is a hand-built JSON string
    containing the location's digest (computed under the same version), the
    state's sequence tagged as a "LiteralSequenceExpression", and the "Allele"
    type tag.

    :param as_version: VRS version string selecting the convention to follow.
    :return: the serialized allele as a string.
    :raises ValueError: if `as_version` is not '1.3', or if the state carries
        no sequence.
    """
    # Reject unsupported versions before touching location/state, so the
    # caller gets this class's own error instead of one from a nested object.
    if as_version != '1.3':
        raise ValueError(f'Serializing as version {as_version} not supported for this class.')

    location_digest = self.location.compute_digest(as_version=as_version)

    # A state object with no `sequence` attribute at all is reported the same
    # way as one whose sequence is unset.
    state_sequence = getattr(self.state, 'sequence', None)
    sequence = None if state_sequence is None else get_pydantic_root(state_sequence)
    if sequence is None:
        raise ValueError('State sequence attribute must be defined.')

    return f'{{"location":"{location_digest}","state":{{"sequence":"{sequence}","type":"LiteralSequenceExpression"}},"type":"Allele"}}'

class ga4gh(_Ga4ghIdentifiableObject.ga4gh):
prefix = 'VA'
priorPrefix = {'1.3': 'VA'}
keys = [
'location',
'state',
Expand Down
2 changes: 1 addition & 1 deletion submodules/vrs
Submodule vrs updated 1 files
+25 −13 validation/models.yaml
15 changes: 15 additions & 0 deletions tests/validation/test_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,25 @@
from ga4gh.core import ga4gh_serialize, ga4gh_digest, ga4gh_identify
from ga4gh.vrs import models

def ga4gh_1_3_identify(*args, **kwargs):
    """Call ga4gh_identify with 'as_version' forced to '1.3' (overriding any caller value)."""
    return ga4gh_identify(*args, **{**kwargs, 'as_version': '1.3'})

def ga4gh_1_3_digest(*args, **kwargs):
    """Call ga4gh_digest with 'as_version' forced to '1.3' (overriding any caller value)."""
    versioned_kwargs = dict(kwargs, as_version='1.3')
    return ga4gh_digest(*args, **versioned_kwargs)

def ga4gh_1_3_serialize(*args, **kwargs):
    """Call ga4gh_serialize with 'as_version' forced to '1.3' (overriding any caller value)."""
    kwargs.update(as_version='1.3')
    return ga4gh_serialize(*args, **kwargs)

def _serialize_decoded(o):
    """Serialize `o` once and return the result decoded to str, or None if it is empty/None."""
    serialized = ga4gh_serialize(o)
    return serialized.decode() if serialized else None


# Maps function names to the callables that implement them for the tests in
# this module.
# The 1.3 serializer entry is used as-is (no decode step), unlike the
# current-version entry.
fxs = {
    "ga4gh_serialize": _serialize_decoded,
    "ga4gh_digest": ga4gh_digest,
    "ga4gh_identify": ga4gh_identify,
    "ga4gh_1_3_digest": ga4gh_1_3_digest,
    "ga4gh_1_3_identify": ga4gh_1_3_identify,
    "ga4gh_1_3_serialize": ga4gh_1_3_serialize,
}

validation_fn = os.path.join(os.path.dirname(__file__), "data", "models.yaml")
Expand Down

0 comments on commit 3347873

Please sign in to comment.