From 06ec7f211755eba00c39657633274c63bdeb3b62 Mon Sep 17 00:00:00 2001 From: James Stevenson Date: Mon, 4 Nov 2024 15:35:15 -0500 Subject: [PATCH] progress: need to somehow get ID though --- src/ga4gh/core/identifiers.py | 6 +++++- src/ga4gh/vrs/models.py | 23 +++++++++++------------ tests/test_vrs.py | 12 ++++++++++++ 3 files changed, 28 insertions(+), 13 deletions(-) diff --git a/src/ga4gh/core/identifiers.py b/src/ga4gh/core/identifiers.py index 7b026243..f00d4ed4 100644 --- a/src/ga4gh/core/identifiers.py +++ b/src/ga4gh/core/identifiers.py @@ -113,7 +113,11 @@ def is_ga4gh_identifier(ir): return str(get_pydantic_root(ir)).startswith(NS_W_SEP) -def ga4gh_identify(vro, in_place: str = 'default', as_version: PrevVrsVersion | None = None) -> str | None: +def ga4gh_identify( + vro, + in_place: str = 'default', + as_version: PrevVrsVersion | None = None +) -> str | None: """Return the GA4GH digest-based id for the object, as a CURIE (string). Returns None if object is not identifiable. diff --git a/src/ga4gh/vrs/models.py b/src/ga4gh/vrs/models.py index 5ed3a1b2..687c54dd 100644 --- a/src/ga4gh/vrs/models.py +++ b/src/ga4gh/vrs/models.py @@ -197,17 +197,17 @@ class Syntax(str, Enum): SPDI = "spdi" -def _recurse_ga4gh_serialize(obj): +def _recurse_ga4gh_serialize(obj, store_digest: bool = True): if isinstance(obj, _Ga4ghIdentifiableObject): - return obj.get_or_create_digest() + return obj.get_or_create_digest(store=store_digest) elif isinstance(obj, _ValueObject): return obj.ga4gh_serialize() elif isinstance(obj, RootModel): - return _recurse_ga4gh_serialize(obj.model_dump()) + return _recurse_ga4gh_serialize(obj.model_dump(), store_digest) elif isinstance(obj, str): return obj elif isinstance(obj, list): - return [_recurse_ga4gh_serialize(x) for x in obj] + return [_recurse_ga4gh_serialize(x, store_digest) for x in obj] else: return obj @@ -220,11 +220,11 @@ class _ValueObject(Entity, ABC): def __hash__(self): return encode_canonical_json(self.ga4gh_serialize()).decode("utf-8").__hash__() - def ga4gh_serialize(self) -> Dict: + def ga4gh_serialize(self, store_digest: bool = True) -> Dict: out = OrderedDict() for k in self.ga4gh.keys: v = getattr(self, k) - out[k] = _recurse_ga4gh_serialize(v) + out[k] = _recurse_ga4gh_serialize(v, store_digest=store_digest) return out class ga4gh: @@ -266,7 +266,7 @@ def compute_digest(self, store: bool = True, as_version: PrevVrsVersion | None = returned following the conventions of the VRS version indicated by ``as_version_``. """ if as_version is None: - digest = sha512t24u(encode_canonical_json(self.ga4gh_serialize())) + digest = sha512t24u(encode_canonical_json(self.ga4gh_serialize(store_digest=store))) if store: self.digest = digest else: @@ -281,7 +281,6 @@ def get_or_create_ga4gh_identifier( in_place: str = 'default', recompute: bool = False, as_version: PrevVrsVersion | None = None, - store_digest: bool = True, ) -> str: """Sets and returns a GA4GH Computed Identifier for the object. Overwrites the existing identifier if overwrite is True. @@ -301,8 +300,8 @@ def get_or_create_ga4gh_identifier( :param recompute: :param as_version: If provided, other parameters are ignored and a computed identifier is returned following the conventions of the given VRS version. - :param store_digest: if ``False``, don't set the object's ``digest`` field. """ + store_digest = in_place != 'never' if as_version is not None: return self.compute_ga4gh_identifier(as_version=as_version) @@ -338,7 +337,7 @@ def compute_ga4gh_identifier( self.get_or_create_digest(recompute, store=store_digest) return f'{CURIE_NAMESPACE}{CURIE_SEP}{self.ga4gh.prefix}{GA4GH_PREFIX_SEP}{self.digest}' else: - digest = self.compute_digest(as_version=as_version) + digest = self.compute_digest(store=store_digest, as_version=as_version) return f'{CURIE_NAMESPACE}{CURIE_SEP}{self.ga4gh.priorPrefix[as_version]}{GA4GH_PREFIX_SEP}{digest}' def get_or_create_digest(self, recompute: bool = False, store: bool = True) -> str: @@ -674,8 +673,8 @@ class CisPhasedBlock(_VariationBase): ) sequenceReference: Optional[SequenceReference] = Field(None, description="An optional Sequence Reference on which all of the in-cis Alleles are found. When defined, this may be used to implicitly define the `sequenceReference` attribute for each of the CisPhasedBlock member Alleles.") - def ga4gh_serialize(self) -> Dict: - out = _ValueObject.ga4gh_serialize(self) + def ga4gh_serialize(self, store_digest: bool = True) -> Dict: + out = _ValueObject.ga4gh_serialize(self, store_digest=store_digest) out["members"] = sorted(out["members"]) return out diff --git a/tests/test_vrs.py b/tests/test_vrs.py index 0ee47f16..cee78cc2 100644 --- a/tests/test_vrs.py +++ b/tests/test_vrs.py @@ -308,6 +308,8 @@ def test_compute_identifiers_when(): # when id property is missing vo_a = models.Allele(**a) assert ga4gh_identify(vo_a, in_place='never') == correct_id + assert vo_a.digest is None + assert vo_a.location.digest is None with use_ga4gh_compute_identifier_when(VrsObjectIdentifierIs.ANY): assert ga4gh_identify(vo_a, in_place='never') == correct_id with use_ga4gh_compute_identifier_when(VrsObjectIdentifierIs.GA4GH_INVALID): @@ -319,6 +321,8 @@ def test_compute_identifiers_when(): a["id"] = None vo_a = models.Allele(**a) assert ga4gh_identify(vo_a, in_place='never') == correct_id + assert vo_a.digest is None + assert vo_a.location.digest is None with use_ga4gh_compute_identifier_when(VrsObjectIdentifierIs.ANY): assert ga4gh_identify(vo_a, in_place='never') == correct_id with use_ga4gh_compute_identifier_when(VrsObjectIdentifierIs.GA4GH_INVALID): @@ -330,6 +334,8 @@ def test_compute_identifiers_when(): a["id"] = "" vo_a = models.Allele(**a) assert ga4gh_identify(vo_a, in_place='never') == correct_id + assert vo_a.digest is None + assert vo_a.location.digest is None with use_ga4gh_compute_identifier_when(VrsObjectIdentifierIs.ANY): assert ga4gh_identify(vo_a, in_place='never') == correct_id with use_ga4gh_compute_identifier_when(VrsObjectIdentifierIs.GA4GH_INVALID): @@ -341,6 +347,8 @@ def test_compute_identifiers_when(): a["id"] = syntax_invalid_id vo_a = models.Allele(**a) assert ga4gh_identify(vo_a, in_place='never') == correct_id + assert vo_a.digest is None + assert vo_a.location.digest is None with use_ga4gh_compute_identifier_when(VrsObjectIdentifierIs.ANY): assert ga4gh_identify(vo_a, in_place='never') == correct_id with use_ga4gh_compute_identifier_when(VrsObjectIdentifierIs.GA4GH_INVALID): @@ -352,6 +360,8 @@ def test_compute_identifiers_when(): a["id"] = syntax_valid_id vo_a = models.Allele(**a) assert ga4gh_identify(vo_a, in_place='never') == correct_id + assert vo_a.digest is None + assert vo_a.location.digest is None with use_ga4gh_compute_identifier_when(VrsObjectIdentifierIs.ANY): assert ga4gh_identify(vo_a, in_place='never') == correct_id with use_ga4gh_compute_identifier_when(VrsObjectIdentifierIs.GA4GH_INVALID): @@ -364,6 +374,8 @@ def test_compute_identifiers_when(): vo_a = models.Allele(**a) assert ga4gh_identify(vo_a, in_place='never') == correct_id assert ga4gh_identify(vo_a, in_place='never') is not correct_id + assert vo_a.digest is None + assert vo_a.location.digest is None with use_ga4gh_compute_identifier_when(VrsObjectIdentifierIs.ANY): assert ga4gh_identify(vo_a, in_place='never') == correct_id assert ga4gh_identify(vo_a, in_place='never') is not correct_id