-
Notifications
You must be signed in to change notification settings - Fork 27
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat: VRS 1.3 digests may be computed for Alleles and Sequence Locations #427
Changes from 5 commits
7f0fefe
59e0818
96893c9
06bc1b5
2e55240
0e7dc40
d1ce82f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -16,6 +16,7 @@ | |
import inspect | ||
import sys | ||
from ga4gh.core import sha512t24u, GA4GH_PREFIX_SEP, CURIE_SEP, CURIE_NAMESPACE, GA4GH_IR_REGEXP | ||
from ga4gh.core.pydantic import get_pydantic_root | ||
|
||
from pydantic import BaseModel, Field, RootModel, StringConstraints, model_serializer | ||
|
||
|
@@ -229,16 +230,24 @@ def has_valid_ga4gh_id(self): | |
def has_valid_digest(self): | ||
return bool(self.digest) # Pydantic constraint ensures digest field value is valid | ||
|
||
def compute_digest(self, store=True) -> str: | ||
def compute_digest(self, store=True, as_version=None) -> str: | ||
"""A sha512t24u digest created using the VRS Computed Identifier algorithm. | ||
Stores the digest in the object if store is True. | ||
Stores the digest in the object if store is True. If 'as_version' is set to | ||
a version string, other parameters are ignored and a digest returned | ||
following the conventions of the VRS version indicated by 'as_version'. | ||
""" | ||
digest = sha512t24u(self.model_dump_json().encode("utf-8")) | ||
if store: | ||
self.digest = digest | ||
if as_version is None: | ||
digest = sha512t24u(self.model_dump_json().encode("utf-8")) | ||
if store: | ||
self.digest = digest | ||
else: | ||
try: | ||
digest = sha512t24u(self.ga4gh_serialize_as_version(as_version).encode("utf-8")) | ||
except AttributeError: | ||
raise AttributeError('This class does not support prior version identifiers.') | ||
return digest | ||
|
||
def get_or_create_ga4gh_identifier(self, in_place='default', recompute=False) -> str: | ||
def get_or_create_ga4gh_identifier(self, in_place='default', recompute=False, as_version=None) -> str: | ||
"""Sets and returns a GA4GH Computed Identifier for the object. | ||
Overwrites the existing identifier if overwrite is True. | ||
|
||
|
@@ -252,7 +261,14 @@ def get_or_create_ga4gh_identifier(self, in_place='default', recompute=False) -> | |
even when empty | ||
|
||
Digests will be recalculated even if present if recompute is True. | ||
|
||
If 'as_version' is set to a version string, other parameters are | ||
ignored and an identifier returned following the conventions of | ||
the VRS version indicated by 'as_version'. | ||
""" | ||
if as_version is not None: | ||
return self.compute_ga4gh_identifier(as_version=as_version) | ||
|
||
if in_place == 'default': | ||
if self.id is None: | ||
self.id = self.compute_ga4gh_identifier(recompute) | ||
|
@@ -268,10 +284,19 @@ def get_or_create_ga4gh_identifier(self, in_place='default', recompute=False) -> | |
else: | ||
return self.compute_ga4gh_identifier(recompute) | ||
|
||
def compute_ga4gh_identifier(self, recompute=False): | ||
"""Returns a GA4GH Computed Identifier""" | ||
self.get_or_create_digest(recompute) | ||
return f'{CURIE_NAMESPACE}{CURIE_SEP}{self.ga4gh.prefix}{GA4GH_PREFIX_SEP}{self.digest}' | ||
def compute_ga4gh_identifier(self, recompute=False, as_version=None): | ||
"""Returns a GA4GH Computed Identifier. | ||
|
||
If 'as_version' is set to a version string, other parameters are | ||
ignored and a computed identifier returned following the conventions | ||
of the VRS version indicated by 'as_version'. | ||
""" | ||
if as_version is None: | ||
self.get_or_create_digest(recompute) | ||
return f'{CURIE_NAMESPACE}{CURIE_SEP}{self.ga4gh.prefix}{GA4GH_PREFIX_SEP}{self.digest}' | ||
else: | ||
digest = self.compute_digest(as_version=as_version) | ||
return f'{CURIE_NAMESPACE}{CURIE_SEP}{self.ga4gh.priorPrefix[as_version]}{GA4GH_PREFIX_SEP}{digest}' | ||
|
||
def get_or_create_digest(self, recompute=False) -> str: | ||
"""Sets and returns a sha512t24u digest of the GA4GH Identifiable Object, or creates | ||
|
@@ -431,6 +456,29 @@ class SequenceLocation(_Ga4ghIdentifiableObject): | |
) | ||
sequence: Optional[SequenceString] = Field(None, description="The literal sequence encoded by the `sequenceReference` at these coordinates.") | ||
|
||
def ga4gh_serialize_as_version(self, as_version): | ||
"""This method will return a serialized string following the conventions for | ||
SequenceLocation serialization as defined in the VRS version specified by 'as_version`.""" | ||
if as_version == '1.3': | ||
out = list() | ||
for value in [self.start,self.end]: | ||
value = get_pydantic_root(value) | ||
if isinstance(value, int): | ||
result = f'{{"type":"Number","value":{value}}}' | ||
elif isinstance(value, list): | ||
if value[0] is None: | ||
result = f'{{"comparator":"<=","type":"IndefiniteRange","value":{value[1]}}}' | ||
elif value[1] is None: | ||
result = f'{{"comparator":">=","type":"IndefiniteRange","value":{value[0]}}}' | ||
else: | ||
result = f'{{"max":{value[1]},"min":{value[0]},"type":"DefiniteRange"}}' | ||
else: | ||
raise ValueError(f'{value} is not int or list.') | ||
out.append(result) | ||
return f'{{"interval":{{"end":{out[1]},"start":{out[0]},"type":"SequenceInterval"}},"sequence_id":"{self.sequenceReference.refgetAccession.split('.')[1]}","type":"SequenceLocation"}}' | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This assumes that [inline code lost in extraction; presumably that `self.sequenceReference` is a `SequenceReference` exposing `refgetAccession`]. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes, agreed. |
||
else: | ||
raise ValueError(f'Serializing as version {as_version} not supported for this class.') | ||
|
||
def get_refget_accession(self): | ||
if isinstance(self.sequenceReference, SequenceReference): | ||
return self.sequenceReference.refgetAccession | ||
|
@@ -441,6 +489,7 @@ def get_refget_accession(self): | |
|
||
class ga4gh(_Ga4ghIdentifiableObject.ga4gh): | ||
prefix = 'SL' | ||
priorPrefix = {'1.3': 'VSL'} | ||
keys = [ | ||
'end', | ||
'sequenceReference', | ||
|
@@ -474,8 +523,21 @@ class Allele(_VariationBase): | |
..., description='An expression of the sequence state' | ||
) | ||
|
||
def ga4gh_serialize_as_version(self, as_version): | ||
"""This method will return a serialized string following the conventions for | ||
Allele serialization as defined in the VRS version specified by 'as_version`.""" | ||
location_digest = self.location.compute_digest(as_version=as_version) | ||
sequence = get_pydantic_root(self.state.sequence) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We may want to check that [inline code lost in extraction; presumably a check on the type of `self.state`]. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. LSE or RLE is okay here; both have a `sequence` attribute. |
||
if sequence is None: | ||
raise ValueError('State sequence attribute must be defined.') | ||
if as_version == '1.3': | ||
return f'{{"location":"{location_digest}","state":{{"sequence":"{sequence}","type":"LiteralSequenceExpression"}},"type":"Allele"}}' | ||
else: | ||
raise ValueError(f'Serializing as version {as_version} not supported for this class.') | ||
|
||
class ga4gh(_Ga4ghIdentifiableObject.ga4gh): | ||
prefix = 'VA' | ||
priorPrefix = {'1.3': 'VA'} | ||
keys = [ | ||
'location', | ||
'state', | ||
|
@@ -502,6 +564,7 @@ def ga4gh_serialize(self) -> Dict: | |
|
||
class ga4gh(_Ga4ghIdentifiableObject.ga4gh): | ||
prefix = 'CPB' | ||
priorPrefix = {'1.3': 'VH'} | ||
ahwagner marked this conversation as resolved.
Show resolved
Hide resolved
|
||
keys = [ | ||
'members', | ||
'type' | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Probably good to use an enum here if we are expected to support all previous versions (1.2, 1.1, 1.0) in future PRs. If not, we could probably just rename `as_version` to `to_1_3_version` or something and have it be a bool that defaults to `False`. Just so we're not hardcoding.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If supporting multiple versions and using enum, it might be good to fail fast in the beginning of the function and raise an exception if version is not supported.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Agree – the intent is to support multiple versions down the road, and we should leave that door open. Also agree with the fail-fast approach.