Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: VRS 1.3 digests may be computed for Alleles and Sequence Locations #427

Merged
merged 7 commits into from
Jul 16, 2024
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 24 additions & 9 deletions src/ga4gh/core/identifiers.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ def parse_ga4gh_identifier(ir):
raise ValueError(ir) from e


def ga4gh_identify(vro, in_place='default'):
def ga4gh_identify(vro, in_place='default', as_version=None):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It would probably be good to use an enum here if we are expected to support all previous versions (1.2, 1.1, 1.0) in future PRs. If not, we could rename `as_version` to `to_1_3_version` (or something similar) and make it a bool that defaults to False, so that we're not hardcoding a version string.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we support multiple versions and use an enum, it would be good to fail fast at the beginning of the function and raise an exception if the requested version is not supported.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agreed – the intent is to support multiple versions down the road, and we should leave that door open. I also agree with the fail-fast approach.

"""
Return the GA4GH digest-based id for the object, as a CURIE
(string). Returns None if object is not identifiable.
Expand All @@ -137,6 +137,10 @@ def ga4gh_identify(vro, in_place='default'):
- 'never': the vro.id field will not be edited in-place,
even when empty

If 'as_version' is set to a version string, other parameters are
ignored and an identifier returned following the conventions of
the VRS version indicated by 'as_version'.

TODO update example for VRS 2.0
>>> import ga4gh.vrs
>>> ival = ga4gh.vrs.models.SimpleInterval(start=44908821, end=44908822)
Expand All @@ -158,18 +162,20 @@ def ga4gh_identify(vro, in_place='default'):
do_compute = not vro.has_valid_ga4gh_id()

if do_compute:
obj_id = vro.get_or_create_ga4gh_identifier(in_place)
obj_id = vro.get_or_create_ga4gh_identifier(in_place, as_version=as_version)

return obj_id

return None


def ga4gh_digest(vro: BaseModel, overwrite=False):
def ga4gh_digest(vro: BaseModel, overwrite=False, as_version=None):
"""
Return the GA4GH digest for the object.

do_compact: bool - true if object compaction should be performed during serialization
If 'as_version' is set to a version string, other parameters
are ignored and a digest returned following the conventions of
the VRS version indicated by 'as_version'.

TODO update example

Expand All @@ -181,7 +187,10 @@ def ga4gh_digest(vro: BaseModel, overwrite=False):

"""
if vro.is_ga4gh_identifiable(): # Only GA4GH identifiable objects are GA4GH digestible
return vro.get_or_create_digest(overwrite)
if as_version is None:
return vro.get_or_create_digest(overwrite)
else:
return vro.compute_digest(as_version=as_version)
else:
return None

Expand Down Expand Up @@ -210,9 +219,15 @@ def collapse_identifiable_values(obj: dict) -> dict:
return obj


def ga4gh_serialize(obj: BaseModel) -> Optional[bytes]:
def ga4gh_serialize(obj: BaseModel, as_version=None) -> Optional[bytes]:
"""
TODO find a way to output identify_all without the 'digest' fields on subobjects,
without traversing the whole tree again in collapse_identifiable_values.
Serializes an object for use in computed digest computation.

If a VRS version string is specified for the 'as_version' parameter,
the returned serialization follows the convention of the specified
VRS version.
"""
return obj.model_dump_json().encode("utf-8")
if as_version is None:
return obj.model_dump_json().encode("utf-8")
else:
return obj.ga4gh_serialize_as_version(as_version)
83 changes: 73 additions & 10 deletions src/ga4gh/vrs/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
import inspect
import sys
from ga4gh.core import sha512t24u, GA4GH_PREFIX_SEP, CURIE_SEP, CURIE_NAMESPACE, GA4GH_IR_REGEXP
from ga4gh.core.pydantic import get_pydantic_root

from pydantic import BaseModel, Field, RootModel, StringConstraints, model_serializer

Expand Down Expand Up @@ -229,16 +230,24 @@ def has_valid_ga4gh_id(self):
def has_valid_digest(self):
return bool(self.digest) # Pydantic constraint ensures digest field value is valid

def compute_digest(self, store=True) -> str:
def compute_digest(self, store=True, as_version=None) -> str:
"""A sha512t24u digest created using the VRS Computed Identifier algorithm.
Stores the digest in the object if store is True.
Stores the digest in the object if store is True. If 'as_version' is set to
a version string, other parameters are ignored and a digest returned
following the conventions of the VRS version indicated by 'as_version'.
"""
digest = sha512t24u(self.model_dump_json().encode("utf-8"))
if store:
self.digest = digest
if as_version is None:
digest = sha512t24u(self.model_dump_json().encode("utf-8"))
if store:
self.digest = digest
else:
try:
digest = sha512t24u(self.ga4gh_serialize_as_version(as_version).encode("utf-8"))
except AttributeError:
raise AttributeError('This class does not support prior version identifiers.')
return digest

def get_or_create_ga4gh_identifier(self, in_place='default', recompute=False) -> str:
def get_or_create_ga4gh_identifier(self, in_place='default', recompute=False, as_version=None) -> str:
"""Sets and returns a GA4GH Computed Identifier for the object.
Overwrites the existing identifier if overwrite is True.

Expand All @@ -252,7 +261,14 @@ def get_or_create_ga4gh_identifier(self, in_place='default', recompute=False) ->
even when empty

Digests will be recalculated even if present if recompute is True.

If 'as_version' is set to a version string, other parameters are
ignored and an identifier returned following the conventions of
the VRS version indicated by 'as_version'.
"""
if as_version is not None:
return self.compute_ga4gh_identifier(as_version=as_version)

if in_place == 'default':
if self.id is None:
self.id = self.compute_ga4gh_identifier(recompute)
Expand All @@ -268,10 +284,19 @@ def get_or_create_ga4gh_identifier(self, in_place='default', recompute=False) ->
else:
return self.compute_ga4gh_identifier(recompute)

def compute_ga4gh_identifier(self, recompute=False):
"""Returns a GA4GH Computed Identifier"""
self.get_or_create_digest(recompute)
return f'{CURIE_NAMESPACE}{CURIE_SEP}{self.ga4gh.prefix}{GA4GH_PREFIX_SEP}{self.digest}'
def compute_ga4gh_identifier(self, recompute=False, as_version=None):
"""Returns a GA4GH Computed Identifier.

If 'as_version' is set to a version string, other parameters are
ignored and a computed identifier returned following the conventions
of the VRS version indicated by 'as_version'.
"""
if as_version is None:
self.get_or_create_digest(recompute)
return f'{CURIE_NAMESPACE}{CURIE_SEP}{self.ga4gh.prefix}{GA4GH_PREFIX_SEP}{self.digest}'
else:
digest = self.compute_digest(as_version=as_version)
return f'{CURIE_NAMESPACE}{CURIE_SEP}{self.ga4gh.priorPrefix[as_version]}{GA4GH_PREFIX_SEP}{digest}'

def get_or_create_digest(self, recompute=False) -> str:
"""Sets and returns a sha512t24u digest of the GA4GH Identifiable Object, or creates
Expand Down Expand Up @@ -431,6 +456,29 @@ class SequenceLocation(_Ga4ghIdentifiableObject):
)
sequence: Optional[SequenceString] = Field(None, description="The literal sequence encoded by the `sequenceReference` at these coordinates.")

def ga4gh_serialize_as_version(self, as_version):
"""This method will return a serialized string following the conventions for
SequenceLocation serialization as defined in the VRS version specified by 'as_version`."""
if as_version == '1.3':
out = list()
for value in [self.start,self.end]:
value = get_pydantic_root(value)
if isinstance(value, int):
result = f'{{"type":"Number","value":{value}}}'
elif isinstance(value, list):
if value[0] is None:
result = f'{{"comparator":"<=","type":"IndefiniteRange","value":{value[1]}}}'
elif value[1] is None:
result = f'{{"comparator":">=","type":"IndefiniteRange","value":{value[0]}}}'
else:
result = f'{{"max":{value[1]},"min":{value[0]},"type":"DefiniteRange"}}'
else:
raise ValueError(f'{value} is not int or list.')
out.append(result)
return f'{{"interval":{{"end":{out[1]},"start":{out[0]},"type":"SequenceInterval"}},"sequence_id":"{self.sequenceReference.refgetAccession.split('.')[1]}","type":"SequenceLocation"}}'
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This assumes that sequenceReference is not null and that it is of type SequenceReference. We should handle the cases where sequenceReference is null and where it is an IRI.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, agreed.

else:
raise ValueError(f'Serializing as version {as_version} not supported for this class.')

def get_refget_accession(self):
if isinstance(self.sequenceReference, SequenceReference):
return self.sequenceReference.refgetAccession
Expand All @@ -441,6 +489,7 @@ def get_refget_accession(self):

class ga4gh(_Ga4ghIdentifiableObject.ga4gh):
prefix = 'SL'
priorPrefix = {'1.3': 'VSL'}
keys = [
'end',
'sequenceReference',
Expand Down Expand Up @@ -474,8 +523,21 @@ class Allele(_VariationBase):
..., description='An expression of the sequence state'
)

def ga4gh_serialize_as_version(self, as_version):
"""This method will return a serialized string following the conventions for
Allele serialization as defined in the VRS version specified by 'as_version`."""
location_digest = self.location.compute_digest(as_version=as_version)
sequence = get_pydantic_root(self.state.sequence)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We may want to check that state is a LiteralSequenceExpression and raise an exception if not. LengthExpression does not have a sequence field, and I'm not sure whether we want ReferenceLengthExpression to be converted to LSE in 1.3.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LSE or RLE is okay here; both have a sequence field, and for small variants the RLE sequence is populated by default. However, I agree that we want to do some processing here as a separate PR; ideally we would implement the RLE to LSE function (it should work like this) and just apply that in the case this is an RLE without a defined sequence.

if sequence is None:
raise ValueError('State sequence attribute must be defined.')
if as_version == '1.3':
return f'{{"location":"{location_digest}","state":{{"sequence":"{sequence}","type":"LiteralSequenceExpression"}},"type":"Allele"}}'
else:
raise ValueError(f'Serializing as version {as_version} not supported for this class.')

class ga4gh(_Ga4ghIdentifiableObject.ga4gh):
prefix = 'VA'
priorPrefix = {'1.3': 'VA'}
keys = [
'location',
'state',
Expand All @@ -502,6 +564,7 @@ def ga4gh_serialize(self) -> Dict:

class ga4gh(_Ga4ghIdentifiableObject.ga4gh):
prefix = 'CPB'
priorPrefix = {'1.3': 'VH'}
ahwagner marked this conversation as resolved.
Show resolved Hide resolved
keys = [
'members',
'type'
Expand Down
2 changes: 1 addition & 1 deletion submodules/vrs
Submodule vrs updated 1 files
+25 −13 validation/models.yaml
15 changes: 15 additions & 0 deletions tests/validation/test_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,25 @@
from ga4gh.core import ga4gh_serialize, ga4gh_digest, ga4gh_identify
from ga4gh.vrs import models

def ga4gh_1_3_identify(*args, **kwargs):
    """Call ``ga4gh_identify`` with ``as_version`` forced to ``'1.3'``.

    Positional and keyword arguments are forwarded unchanged; any
    caller-supplied ``as_version`` is overridden.
    """
    forced = {**kwargs, 'as_version': '1.3'}
    return ga4gh_identify(*args, **forced)

def ga4gh_1_3_digest(*args, **kwargs):
    """Call ``ga4gh_digest`` with ``as_version`` forced to ``'1.3'``.

    Positional and keyword arguments are forwarded unchanged; any
    caller-supplied ``as_version`` is overridden.
    """
    forced = {**kwargs, 'as_version': '1.3'}
    return ga4gh_digest(*args, **forced)

def ga4gh_1_3_serialize(*args, **kwargs):
    """Call ``ga4gh_serialize`` with ``as_version`` forced to ``'1.3'``.

    Positional and keyword arguments are forwarded unchanged; any
    caller-supplied ``as_version`` is overridden.
    """
    forced = {**kwargs, 'as_version': '1.3'}
    return ga4gh_serialize(*args, **forced)

def _serialize_to_text(o):
    """Return ``ga4gh_serialize(o)`` decoded to ``str``, or None when the result is falsy."""
    # Serialize once and reuse the result; the previous lambda invoked
    # ga4gh_serialize twice for every object.
    blob = ga4gh_serialize(o)
    return blob.decode() if blob else None


# Name -> callable dispatch table. The "ga4gh_1_3_*" entries are the wrappers
# that force as_version='1.3'.
# NOTE(review): presumably the keys are looked up by name from the validation
# YAML loaded below -- confirm against the test body.
fxs = {
    "ga4gh_serialize": _serialize_to_text,
    "ga4gh_digest": ga4gh_digest,
    "ga4gh_identify": ga4gh_identify,
    "ga4gh_1_3_digest": ga4gh_1_3_digest,
    "ga4gh_1_3_identify": ga4gh_1_3_identify,
    "ga4gh_1_3_serialize": ga4gh_1_3_serialize,
}

validation_fn = os.path.join(os.path.dirname(__file__), "data", "models.yaml")
Expand Down
Loading