From 48cb1430053b6363e6cbf1fb4e514f97b47bff5e Mon Sep 17 00:00:00 2001 From: Kori Kuzma Date: Wed, 10 Jul 2024 10:21:49 -0400 Subject: [PATCH 1/4] feat: add make command (`nbready`) to create jupyter notebook venv (#419) close #403 --- Makefile | 8 ++++++++ notebooks/getting_started/README.md | 4 ++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 6269550e..5d9c5ec0 100644 --- a/Makefile +++ b/Makefile @@ -57,6 +57,14 @@ devready: @echo '### Do not forget to `source ${VEDIR}/bin/activate` to use this environment ###' @echo '#################################################################################' +#=> nbready: create venv, install prerequisites, install pkg in notebook mode +.PHONY: nbready +nbready: + make ${VEDIR} && source ${VEDIR}/bin/activate && pip install -e '.[extras,notebooks]' + @echo '#################################################################################' + @echo '### Do not forget to `source ${VEDIR}/bin/activate` to use this environment ###' + @echo '#################################################################################' + #=> install: install package .PHONY: install-extras install-extras: diff --git a/notebooks/getting_started/README.md b/notebooks/getting_started/README.md index 1a95de6f..bd766616 100644 --- a/notebooks/getting_started/README.md +++ b/notebooks/getting_started/README.md @@ -16,8 +16,8 @@ The following software packages must exist in your execution environment before From a terminal window, run the following commands: * git clone --recurse-submodules https://github.com/ga4gh/vrs-python * cd vrs-python -* make devready -* source venv/3.10/bin/activate +* make nbready +* source venv/3.12/bin/activate * cd notebooks/getting_started * jupyter notebook notebook_name.ipynb From 506940e36d3b88f9c609dba2c2d2ec0819732b24 Mon Sep 17 00:00:00 2001 From: Kori Kuzma Date: Wed, 10 Jul 2024 10:30:17 -0400 Subject: [PATCH 2/4] feat!: update vrs/common models (#417) close #409 , #116 * updates vrs/gks-common submodules * Pydantic models updated to reflect these changes * Removed `use_enum_values` from Pydantic config. We didn't use this in all models, so not sure if there was a purpose for it. * Removed duplicate vrs test --- src/ga4gh/core/_internal/models.py | 290 +++++++++++++++------ src/ga4gh/vrs/_internal/models.py | 208 ++++++++------- src/ga4gh/vrs/normalize.py | 4 +- submodules/vrs | 2 +- tests/test_vrs.py | 366 +++++++++++++++++++++++--- tests/test_vrs2.py | 397 ----------------------------- 6 files changed, 650 insertions(+), 617 deletions(-) delete mode 100644 tests/test_vrs2.py diff --git a/src/ga4gh/core/_internal/models.py b/src/ga4gh/core/_internal/models.py index c94b0c18..03d03b67 100644 --- a/src/ga4gh/core/_internal/models.py +++ b/src/ga4gh/core/_internal/models.py @@ -10,20 +10,27 @@ * `import ga4gh.core`, and refer to models using the fully-qualified module name, e.g., `ga4gh.core.common_models.Gene` """ -from typing import Any, Dict, List, Literal, Optional, Union, Annotated +from __future__ import annotations +import datetime +from typing import Any, Dict, Literal, Annotated, Optional, Union, List from enum import Enum -from pydantic import BaseModel, ConfigDict, Field, RootModel, StringConstraints, model_serializer +from pydantic import BaseModel, Field, RootModel, StringConstraints, constr, field_validator, model_serializer, model_validator from ga4gh.core import GA4GH_IR_REGEXP - ######################################### -# gks-common core +# GKS Common Abstract Entity & Utility Class Definitions ######################################### +class AgentSubtype(str, Enum): + """Define constraints for agent subtype""" + + PERSON = "person" + ORGANIZATION = "organization" + COMPUTER = "computer" -class Relation(Enum): +class Relation(str, Enum): """A mapping relation between concepts as defined by the Simple Knowledge Organization System (SKOS). """ @@ -35,6 +42,26 @@ class Relation(Enum): RELATED_MATCH = 'relatedMatch' +class Syntax(str, Enum): + """The syntax used to describe the variation. The value should be one of the + supported syntaxes. + """ + + HGVS_C = "hgvs.c" + HGVS_P = "hgvs.p" + HGVS_G = "hgvs.g" + HGVS_M = "hgvs.m" + HGVS_N = "hgvs.n" + HGVS_R = "hgvs.r" + HGVS_ISCN = "iscn" + GNOMAD = "gnomad" + SPDI = "spdi" + +######################################### +# GKS Common Abstract Utility Classes +# These do not inherit from Entity and are not typed explicitly +######################################### + class Code(RootModel): """Indicates that the value is taken from a set of controlled strings defined elsewhere. Technically, a code is restricted to a string which has at least one @@ -75,32 +102,91 @@ def ga4gh_serialize(self): ) -class Extension(BaseModel): - """The Extension class provides VODs with a means to extend descriptions with other - attributes unique to a content provider. These extensions are not expected to be - natively understood under VRSATILE, but may be used for pre-negotiated exchange of - message attributes when needed. +class RecordMetadata(BaseModel): + """A reusable structure that encapsulates provenance metadata about a serialized + data record or object in a particular dataset (as opposed to provenance about the + real world entity this record or object represents). + """ + + recordIdentifier: Optional[str] = Field(None, description="The identifier of the data record or object described in this RecordMetadata object.") + recordVersion: Optional[str] = Field(None, description="The version number of the record-level artifact the object describes.") + derivedFrom: Optional[str] = Field(None, description="Another data record from which the record described here was derived, through a data ingest and/or transformation process. Value should be a string representing the identifier of the source record.") + dateRecordCreated: Optional[str] = Field(None, description="The date the record was initially created.") + contributions: Optional[List[Contribution]] = Field(None, description="Describes specific contributions made by an human or software agent to the creation, modification, or administrative management of a data record or object.") + + +class Coding(BaseModel): + """A structured representation of a code for a defined concept in a terminology or + code system. """ - model_config = ConfigDict( - extra='allow', + + label: Optional[str] = Field( + None, + description='The human-readable name for the coded concept, as defined by the code system.' + ) + system: str = Field( + ..., + description="The terminology/code system that defined the code. May be reported as a free-text name (e.g. 'Sequence Ontology'), but it is preferable to provide a uri/url for the system. When the 'code' is reported as a CURIE, the 'system' should be reported as the uri that the CURIE's prefix expands to (e.g. 'http://purl.obofoundry.org/so.owl/' for the Sequence Ontology)." + ) + version: Optional[str] = Field( + None, + description='Version of the terminology or code system that provided the code.' + ) + code: Code = Field( + ..., + description="A symbol uniquely identifying the concept, as in a syntax defined by the code system. CURIE format is preferred where possible (e.g. 'SO:0000704' is the CURIE form of the Sequence Ontology code for 'gene')." ) - type: Literal['Extension'] = Field('Extension', description='MUST be "Extension".') - name: str = Field(..., description='A name for the Extension') + + +class ConceptMapping(BaseModel): + """A mapping to a concept in a terminology or code system.""" + + coding: Coding = Field(..., description="A structured representation of a code for a defined concept in a terminology or code system.") + relation: Relation = Field(..., description="A mapping relation between concepts as defined by the Simple Knowledge Organization System (SKOS).") + + +class Extension(BaseModel): + """The Extension class provides entities with a means to include additional + attributes that are outside of the specified standard but needed by a given content + provider or system implementer. These extensions are not expected to be natively + understood, but may be used for pre-negotiated exchange of message attributes + between systems. + """ + + name: str = Field(..., description='A name for the Extension. Should be indicative of its meaning and/or the type of information it value represents.') value: Optional[Union[float, str, bool, Dict[str, Any], List[Any]]] = Field( - None, description='Any primitive or structured object' + None, description='The value of the Extension - can be any primitive or structured object' ) + description: Optional[str] = Field(None, description="A description of the meaning or utility of the Extension, to explain the type of information it is meant to hold.") + + +class Expression(BaseModel): + """Representation of a variation by a specified nomenclature or syntax for a + Variation object. Common examples of expressions for the description of molecular + variation include the HGVS and ISCN nomenclatures. + """ + + syntax: Syntax = Field(..., description="The syntax used to describe the variation. The value should be one of the supported syntaxes.") + value: str = Field(..., description="The expression of the variation in the specified syntax. The value should be a valid expression in the specified syntax.") + syntax_version: Optional[str] = Field(None, description="The version of the syntax used to describe the variation. This is particularly important for HGVS expressions, as the syntax has evolved over time.") + + +######################################### +# GKS Common Abstract Entity Class Definitions +######################################### class _Entity(BaseModel): - """Entity is the root class of `core` classes model - those that have identifiers - and other general metadata like labels, xrefs, urls, descriptions, etc. All core - classes descend from and inherit its attributes. + """Entity is the root class of the 'gks-common' core information model classes - + those that have identifiers and other general metadata like labels, xrefs, urls, + descriptions, etc. All common classes descend from and inherit its attributes. """ id: Optional[str] = Field( None, - description="The 'logical' identifier of the entity in the system of record, e.g. a UUID. This 'id' is unique within a given system. The identified entity may have a different 'id' in a different system, or may refer to an 'id' for the shared concept in another system (e.g. a CURIE)." + description="The 'logical' identifier of the entity in the system of record, e.g. a UUID. This 'id' is unique within a given system. The identified entity may have a different 'id' in a different system, or may refer to an 'id' for the shared concept in another system (e.g. a CURIE)." ) + type: str label: Optional[str] = Field( None, description='A primary label for the entity.' @@ -109,64 +195,132 @@ class _Entity(BaseModel): None, description='A free-text description of the entity.' ) - extensions: Optional[List[Extension]] = None + alternativeLabels: Optional[List[str]] = Field(None, description="Alternative name(s) for the Entity.") + extensions: Optional[List[Extension]] = Field(None, description="A list of extensions to the entity. Extensions are not expected to be natively understood, but may be used for pre-negotiated exchange of message attributes between systems.") + +class _DomainEntity(_Entity): + """An Entity that is specific to a particular biomedical domain such as disease, + therapeutics, or genes. Domain Entities are considered as 'concept-level' entities, + as opposed to particular instances. e.g. 'Lung Cancer', not 'patient123's lung + cancer'. Or 'Erlotinib', not the particular doses given to a patient on a specific + occasion. + """ + mappings: Optional[List[ConceptMapping]] = Field(None, description="A list of mappings to concepts in terminologies or code systems. Each mapping should include a coding and a relation.") -class Coding(BaseModel): - """a concept codified by a terminology system.""" - label: Optional[str] = Field( - None, - description='A primary label for the coding.' - ) - system: str = Field( - ..., - description='Identity of the terminology system.' - ) - version: Optional[str] = Field( - None, - description='Version of the terminology system.' - ) - code: Code = Field( - ..., - description='Symbol in syntax defined by the terminology system.' - ) +class Agent(_Entity): + """An autonomous actor (person, organization, or computational agent) that bears + some form of responsibility for an activity taking place, for the existence of an + entity, or for another agent's activity. + """ + type: Literal["Agent"] = Field("Agent", description="MUST be 'Agent'.") + name: Optional[str] = Field(None, description="The descriptive name of the agent.") + subtype: Optional[AgentSubtype] = Field(None, description="A more specific type of agent the agent represents.") -class Mapping(_Entity): - """A mapping to a concept in a terminology system.""" - model_config = ConfigDict( - use_enum_values=True - ) - coding: Coding - relation: Relation = Field( - ..., - description='A mapping relation between concepts as defined by the Simple Knowledge Organization System (SKOS).' - ) +class Activity(_Entity): + """An action or set of actions performed by an agent, that occurs over a period of + time. Activities may use, generate, modify, move, or destroy one or more entities. + """ + subtype: Optional[Coding] = Field(None, description="A more specific type of activity that an Activity object may represent.") + date: Optional[str] = Field(None, description="The date that the Activity was completed. The date SHOULD be formatted as a date string in ISO format 'YYYY-MM-DD'.") + performedBy: Optional[List[Agent]] = Field(None, description="An Agent who contributed to executing the Activity.") + specifiedBy: Optional[List[Method]] = Field(None, description="A method that was followed in performing an Activity, that describes how it was executed.") + + @field_validator("date") + @classmethod + def date_format(cls, v: Optional[str]) -> Optional[str]: + """Check that date is YYYY-MM-DD format""" + if v: + valid_format = "%Y-%m-%d" + + try: + datetime.datetime.strptime(v, valid_format).replace( + tzinfo=datetime.timezone.utc + ).strftime(valid_format) + except ValueError as e: + msg = "`date` must use YYYY-MM-DD format" + raise ValueError(msg) from e + return v + + +class Contribution(Activity): + """An action taken by an agent in contributing to the creation, modification, + assessment, or deprecation of a particular entity (e.g. a Statement, EvidenceLine, + DataItem, Publication, etc.) + """ -class _MappableEntity(_Entity): - """an Entity that is mappable to codings in other terminology systems.""" + type: Literal["Contribution"] = "Contribution" + contributor: Optional[Agent] = Field(None, description="The agent that made the contribution.") + contributionMadeTo: Optional[_InformationEntity] = Field(None, description="The artifact toward which the contribution was made.") # noqa: N815 + activityType: Optional[Coding] = Field(None, description="SHOULD describe a concept descending from the Contributor Role Ontology.") - mappings: Optional[List[Mapping]] = None + @model_validator(mode="before") + def handle_extends_prop(cls, values: Dict[str, Any]) -> Dict[str, Any]: + """Handle extends properties by renaming fields + :param values: Input values to process + :return: Processed values with extended properties renamed + """ + if "performedBy" in values: + values["contributor"] = values.pop("performedBy") + return values -class _DomainEntity(_MappableEntity): - """An Entity that is specific to a particular biomedical domain such as disease, - therapeutics, or genes. + +class _InformationEntity(_Entity): + """Information Entities are abstract (non-physical) entities that are about + something (i.e. they carry information about things in the real world). """ - type: str - aliases: Optional[List[str]] = Field( + id: str + specifiedBy: Optional[Union[Method, IRI]] = Field(None, description="A `Method` that describes all or part of the process through which the information was generated.") + contributions: Optional[List[Contribution]] = Field(None, description="A list of `Contribution` objects that describe the activities performed by agents upon this entity.") + isReportedIn: Optional[List[Union[Document, IRI]]] = Field(None, description="A document in which the information content is expressed.") + dateAuthored: Optional[str] = Field(None, description="Indicates when the information content expressed in the Information Entity was generated.") + derivedFrom: Optional[List[_InformationEntity]] = Field(None, description="Another Information Entity from which this Information Entity is derived, in whole or in part.") + recordMetadata: Optional[RecordMetadata] = Field(None, description="Metadata that applies to a specific concrete record of information as encoded in a particular system.") + +class Document(_InformationEntity): + """a representation of a physical or digital document""" + + type: Literal["Document"] = "Document" + subtype: Optional[Coding] = Field( + None, description="A more specific type for the document (e.g. a publication, patent, pathology report)" + ) + title: Optional[str] = Field(None, description="The title of the Document") + url: Optional[constr(pattern="^(https?|s?ftp)://")] = Field( + None, description="A URL at which the document may be retrieved." + ) + doi: Optional[constr(pattern="^10.(\\d+)(\\.\\d+)*\\/[\\w\\-\\.]+")] = Field( + None, + description="A `Digital Object Identifier _` for the document.", + ) + pmid: Optional[int] = Field( + None, + description="A `PubMed unique identifier `_.", + ) + + +class Method(_InformationEntity): + """A set of instructions that specify how to achieve some objective (e.g. + experimental protocols, curation guidelines, rule sets, etc.) + """ + + type: Literal["Method"] = Field("Method", description="MUST be 'Method'.") + isReportedIn: Optional[Union[Document, IRI]] = None # noqa: N815 + subtype: Optional[Coding] = Field( None, - description='Aliases are alternate labels for a Domain Entity.' + description="A more specific type of entity the method represents (e.g. Variant Interpretation Guideline, Experimental Protocol)", ) + license: Optional[str] = Field(None, description="A particular license that dictates legal permissions for how a published method (e.g. an experimental protocol, workflow specification, curation guideline) can be used.") ######################################### -# gks-common conditions +# GKS Common Domain Entities ######################################### class Phenotype(_DomainEntity): @@ -205,18 +359,13 @@ class TraitSet(_DomainEntity): class Condition(RootModel): """A disease or other medical disorder.""" - root: Union[Disease, Phenotype, TraitSet] = Field( + root: Union[TraitSet, Disease, Phenotype] = Field( ..., json_schema_extra={'description': 'A disease or other medical disorder.'}, discriminator='type', ) -######################################### -# gks-common therapeutics -######################################### - - class TherapeuticAction(_DomainEntity): """A therapeutic action taken that is intended to alter or stop a pathologic process.""" @@ -258,11 +407,7 @@ class CombinationTherapy(_DomainEntity): 'CombinationTherapy', description='MUST be "CombinationTherapy".' ) - components: List[Union[ - TherapeuticSubstituteGroup, - TherapeuticAction, - TherapeuticAgent - ]] = Field( + components: List[Union[TherapeuticSubstituteGroup, TherapeuticAction, TherapeuticAgent]] = Field( ..., description='The individual therapeutic procedure components that constitute the combination therapy.', min_length=2 @@ -274,18 +419,13 @@ class TherapeuticProcedure(RootModel): intended to alter or stop a pathologic process. """ - root: Union[CombinationTherapy, TherapeuticAction, TherapeuticAgent, TherapeuticSubstituteGroup] = Field( + root: Union[CombinationTherapy, TherapeuticSubstituteGroup, TherapeuticAction, TherapeuticAgent] = Field( ..., json_schema_extra={'description': 'An action or administration of therapeutic agents to produce an effect that is intended to alter or stop a pathologic process.'}, discriminator='type', ) -######################################### -# gks-common therapeutics -######################################### - - class Gene(_DomainEntity): """A basic physical and functional unit of heredity.""" diff --git a/src/ga4gh/vrs/_internal/models.py b/src/ga4gh/vrs/_internal/models.py index b636f498..66064e75 100644 --- a/src/ga4gh/vrs/_internal/models.py +++ b/src/ga4gh/vrs/_internal/models.py @@ -9,15 +9,7 @@ * `import ga4gh.vrs`, and refer to models using the fully-qualified module name, e.g., `ga4gh.vrs.models.Allele` - -New pydantic-based version - -Pydantic classes bootstrapped with: -sed -i.bkp 's/$defs/definitions/g' merged.json -V1 pydantic: datamodel-codegen --input submodules/vrs/schema/merged.json --input-file-type jsonschema --output models_merged2.py -V2 pydantic: datamodel-codegen --input submodules/vrs/schema/merged.json --input-file-type jsonschema --output models.py --output-model-type pydantic_v2.BaseModel --allow-extra-fields """ - from typing import List, Literal, Optional, Union, Dict, Annotated from collections import OrderedDict from enum import Enum @@ -25,13 +17,13 @@ import sys from ga4gh.core import sha512t24u, GA4GH_PREFIX_SEP, CURIE_SEP, CURIE_NAMESPACE, GA4GH_IR_REGEXP -from pydantic import BaseModel, ConfigDict, Field, RootModel, StringConstraints, model_serializer +from pydantic import BaseModel, Field, RootModel, StringConstraints, model_serializer from ga4gh.core._internal.pydantic import ( is_ga4gh_identifiable, getattr_in ) -from ga4gh.core._internal.models import IRI, _Entity +from ga4gh.core._internal.models import IRI, Expression, _DomainEntity def flatten(vals): @@ -133,28 +125,17 @@ def pydantic_class_refatt_map(): class_keys) -class Syntax(Enum): - """Define constraints for syntax""" - - HGVS_C = "hgvs.c" - HGVS_P = "hgvs.p" - HGVS_G = "hgvs.g" - HGVS_M = "hgvs.m" - HGVS_N = "hgvs.n" - HGVS_R = "hgvs.r" - ISCN = "iscn" - GNOMAD = "gnomad" - SPDI = "spdi" - - -class ResidueAlphabet(Enum): - """Define constraints for residue alphabet""" +class ResidueAlphabet(str, Enum): + """The interpretation of the character codes referred to by the refget accession, + where "aa" specifies an amino acid character set, and "na" specifies a nucleic acid + character set. + """ AA = 'aa' NA = 'na' -class CopyChange(Enum): +class CopyChange(str, Enum): """Define constraints for copy change""" EFO_0030069 = 'efo:0030069' @@ -182,7 +163,7 @@ def _recurse_ga4gh_serialize(obj): return obj -class _ValueObject(_Entity): +class _ValueObject(_DomainEntity): """A contextual value whose equality is based on value, not identity. See https://en.wikipedia.org/wiki/Value_object for more on Value Objects. """ @@ -209,10 +190,10 @@ def is_ga4gh_identifiable(): class _Ga4ghIdentifiableObject(_ValueObject): """A contextual value object for which a GA4GH computed identifier can be created. All GA4GH Identifiable Objects may have computed digests from the VRS Computed - Identifier algorithm.""" + Identifier algorithm. + """ type: str - digest: Optional[Annotated[str, StringConstraints(pattern=r'^[0-9A-Za-z_\-]{32}$')]] = Field( None, description='A sha512t24u digest created using the VRS Computed Identifier algorithm.', @@ -286,19 +267,6 @@ class ga4gh(_ValueObject.ga4gh): prefix: str -class Expression(BaseModel): - """Representation of a variation by a specified nomenclature or syntax for a - Variation object. Common examples of expressions for the description of molecular - variation include the HGVS and ISCN nomenclatures. - """ - model_config = ConfigDict( - use_enum_values=True - ) - - syntax: Syntax - value: str - syntax_version: Optional[str] = None - ######################################### # vrs numerics, comparators, and ranges ######################################### @@ -332,7 +300,7 @@ class Residue(RootModel): class SequenceString(RootModel): - """A character string of Residues that represents a biological sequence using the + """A character string of `Residues` that represents a biological sequence using the conventional sequence order (5'-to-3' for nucleic acid sequences, and amino-to-carboxyl for amino acid sequences). IUPAC ambiguity codes are permitted in Sequence Strings. @@ -341,7 +309,7 @@ class SequenceString(RootModel): root: Annotated[str, StringConstraints(pattern=r'^[A-Z*\-]*$')] = Field( ..., json_schema_extra={ - 'description': 'A character string of Residues that represents a biological sequence using the conventional sequence order (5’-to-3’ for nucleic acid sequences, and amino-to-carboxyl for amino acid sequences). IUPAC ambiguity codes are permitted in Sequence Strings.' + 'description': "A character string of Residues that represents a biological sequence using the conventional sequence order (5'-to-3' for nucleic acid sequences, and amino-to-carboxyl for amino acid sequences). IUPAC ambiguity codes are permitted in Sequence Strings." }, ) @@ -357,9 +325,7 @@ class LengthExpression(_ValueObject): type: Literal['LengthExpression'] = Field( 'LengthExpression', description='MUST be "LengthExpression"' ) - length: Optional[Union[Range, int]] = Field( - None - ) + length: Optional[Union[Range, int]] = None class ga4gh(_ValueObject.ga4gh): keys = [ @@ -378,10 +344,10 @@ class ReferenceLengthExpression(_ValueObject): ..., description='The number of residues of the expressed sequence.' ) sequence: Optional[SequenceString] = Field( - None, description='the Sequence encoded by the Reference Length Expression.' + None, description='the `Sequence` encoded by the Reference Length Expression.' ) repeatSubunitLength: int = Field( - None, description='The number of residues of the repeat subunit.' + ..., description='The number of residues of the repeat subunit.' ) class ga4gh(_ValueObject.ga4gh): @@ -415,16 +381,13 @@ class ga4gh(_ValueObject.ga4gh): class SequenceReference(_ValueObject): """A sequence of nucleic or amino acid character codes.""" - model_config = ConfigDict( - use_enum_values=True - ) - type: Literal['SequenceReference'] = Field('SequenceReference', description='MUST be "SequenceReference"') refgetAccession: Annotated[str, StringConstraints(pattern=r'^SQ.[0-9A-Za-z_\-]{32}$')] = Field( ..., description='A `GA4GH RefGet ` identifier for the referenced sequence, using the sha512t24u digest.', ) - residueAlphabet: Optional[ResidueAlphabet] = None + residueAlphabet: Optional[ResidueAlphabet] = Field(None, description="The interpretation of the character codes referred to by the refget accession, where 'aa' specifies an amino acid character set, and 'na' specifies a nucleic acid character set.") + circular: Optional[bool] = Field(None, description="A boolean indicating whether the molecule represented by the sequence is circular (true) or linear (false).") class ga4gh(_ValueObject.ga4gh): keys = [ @@ -438,7 +401,7 @@ class SequenceLocation(_Ga4ghIdentifiableObject): type: Literal['SequenceLocation'] = Field('SequenceLocation', description='MUST be "SequenceLocation"') sequenceReference: Optional[Union[IRI, SequenceReference]] = Field( - None, description='A SequenceReference.' + None, description='A reference to a `Sequence` on which the location is defined.' ) start: Optional[Union[Range, int]] = Field( None, @@ -449,6 +412,8 @@ class SequenceLocation(_Ga4ghIdentifiableObject): description='The end coordinate or range of the SequenceLocation. The minimum value of this coordinate or range is 0. MUST represent a coordinate or range greater than the value of `start`.', ) + sequence: Optional[SequenceString] = Field(None, description="The literal sequence encoded by the `sequenceReference` at these coordinates.") + def get_refget_accession(self): if isinstance(self.sequenceReference, SequenceReference): return self.sequenceReference.refgetAccession @@ -476,6 +441,55 @@ class _VariationBase(_Ga4ghIdentifiableObject): expressions: Optional[List[Expression]] = None +######################################### +# vrs molecular variation +######################################### + + +class Allele(_VariationBase): + """The state of a molecule at a `Location`.""" + + type: Literal['Allele'] = Field('Allele', description='MUST be "Allele"') + location: Union[IRI, SequenceLocation] = Field( + ..., description='The location of the Allele' + ) + state: Union[LiteralSequenceExpression, ReferenceLengthExpression, LengthExpression] = Field( + ..., description='An expression of the sequence state' + ) + + class ga4gh(_Ga4ghIdentifiableObject.ga4gh): + prefix = 'VA' + keys = [ + 'location', + 'state', + 'type' + ] + + +class CisPhasedBlock(_VariationBase): + """An ordered set of co-occurring `Variation` on the same molecule.""" + + type: Literal['CisPhasedBlock'] = Field('CisPhasedBlock', description='MUST be "CisPhasedBlock"') + members: List[Union[Allele, IRI]] = Field( + ..., + description='A list of `Alleles` that are found in-cis on a shared molecule.', + min_length=2, + ) + sequenceReference: Optional[SequenceReference] = Field(None, description="An optional Sequence Reference on which all of the in-cis Alleles are found. When defined, this may be used to implicitly define the `sequenceReference` attribute for each of the CisPhasedBlock member Alleles.") + + @model_serializer(when_used="json") + def ga4gh_serialize(self) -> Dict: + out = _ValueObject.ga4gh_serialize(self) + out["members"] = sorted(out["members"]) + return out + + class ga4gh(_Ga4ghIdentifiableObject.ga4gh): + prefix = 'CPB' + keys = [ + 'members', + 'type' + ] + ######################################### # vrs structural variation (under active discussion) @@ -488,21 +502,18 @@ class Adjacency(_VariationBase): potentially with an intervening linker sequence. """ - model_config = ConfigDict( - use_enum_values=True - ) - type: Literal['Adjacency'] = Field('Adjacency', description='MUST be "Adjacency"') adjoinedSequences: List[Union[IRI, SequenceLocation]] = Field( ..., description="The terminal sequence or pair of adjoined sequences that defines in the adjacency.", - min_length=1, + min_length=2, max_length=2, ) - linker: Optional[Union[LiteralSequenceExpression, ReferenceLengthExpression]] = Field( + linker: Optional[Union[LiteralSequenceExpression, ReferenceLengthExpression, LengthExpression]] = Field( None, - description="he sequence found between adjoined sequences." + description="The sequence found between adjoined sequences." ) + homology: Optional[bool] = Field(None, description="A flag indicating if coordinate ambiguity in the adjoined sequences is from sequence homology (true) or other uncertainty (false).") class ga4gh(_Ga4ghIdentifiableObject.ga4gh): prefix = 'AJ' @@ -513,52 +524,40 @@ class ga4gh(_Ga4ghIdentifiableObject.ga4gh): ] -######################################### -# vrs molecular variation -######################################### - - -class Allele(_VariationBase): - """The state of a molecule at a Location.""" +class SequenceTerminus(_VariationBase): + """The `SequenceTerminus` data class provides a structure for describing the end + (terminus) of a sequence. Structurally similar to Adjacency but the linker sequence + is not allowed and it removes the unnecessary array structure. + """ - type: Literal['Allele'] = Field('Allele', description='MUST be "Allele"') - location: Union[IRI, SequenceLocation] = Field( - ..., description='The location of the Allele' - ) - state: Union[LiteralSequenceExpression, ReferenceLengthExpression] = Field( - ..., description='An expression of the sequence state' - ) + type: Literal["SequenceTerminus"] = Field("SequenceTerminus", description='MUST be "SequenceTerminus"') + location: Union[IRI, SequenceLocation] = Field(..., description="The location of the terminus.") class ga4gh(_Ga4ghIdentifiableObject.ga4gh): - prefix = 'VA' + prefix = "SQX" keys = [ - 'location', - 'state', - 'type' + "location", + "type" ] -class Haplotype(_VariationBase): - """An ordered set of co-occurring Variation on the same molecule.""" +class DerivativeSequence(_VariationBase): + """The "Derivative Sequence" data class is a structure for describing a derivate + sequence composed from multiple sequence adjacencies. + """ - type: Literal['Haplotype'] = Field('Haplotype', description='MUST be "Haplotype"') - members: List[Union[Adjacency, Allele, IRI]] = Field( + type: Literal["DerivativeSequence"] = Field("DerivativeSequence", description='MUST be "DerivativeSequence"') + components: List[Union[IRI, Adjacency, Allele, SequenceTerminus, CisPhasedBlock]] = Field( ..., - description='A list of Alleles and Adjacencies that comprise a Haplotype. Members must share the same reference sequence as adjacent members. Alleles should not have overlapping or adjacent coordinates with neighboring Alleles. Neighboring alleles should be ordered by ascending coordinates, unless represented on a DNA inversion (following an Adjacency with end-defined adjoinedSequences), in which case they should be ordered in descending coordinates. Sequence references MUST be consistent for all members between and including the end of one Adjacency and the beginning of another.', - min_length=2, + description="The sequence components that make up the derivative sequence.", + min_length=2 ) - @model_serializer(when_used='json') - def ga4gh_serialize(self) -> Dict: - out = _ValueObject.ga4gh_serialize(self) - out['members'] = sorted(out['members']) - return out - class ga4gh(_Ga4ghIdentifiableObject.ga4gh): - prefix = 'HT' + prefix = "DSQ" keys = [ - 'members', - 'type' + "components", + "type" ] @@ -599,9 +598,6 @@ class CopyNumberChange(_CopyNumber): """An assessment of the copy number of a `Location` or a `Gene` within a system (e.g. genome, cell, etc.) relative to a baseline ploidy. """ - model_config = ConfigDict( - use_enum_values=True - ) type: Literal['CopyNumberChange'] = Field('CopyNumberChange', description='MUST be "CopyNumberChange"') copyChange: CopyChange = Field( @@ -624,22 +620,22 @@ class ga4gh(_Ga4ghIdentifiableObject.ga4gh): class MolecularVariation(RootModel): - """A variation on a contiguous molecule.""" + """A `variation` on a contiguous molecule.""" - root: Union[Allele, Haplotype] = Field( + root: Union[Allele, CisPhasedBlock, Adjacency, SequenceTerminus, DerivativeSequence] = Field( ..., json_schema_extra={ - 'description': 'A variation on a contiguous molecule.' + 'description': 'A `variation` on a contiguous molecule.' }, discriminator='type' ) class SequenceExpression(RootModel): - """An expression describing a Sequence.""" + """An expression describing a `Sequence`.""" - root: Union[LiteralSequenceExpression, ReferenceLengthExpression] = Field( + root: Union[LiteralSequenceExpression, ReferenceLengthExpression, LengthExpression] = Field( ..., - json_schema_extra={'description': 'An expression describing a Sequence.'}, + json_schema_extra={'description': 'An expression describing a `Sequence`.'}, discriminator='type' ) @@ -659,7 +655,7 @@ class Location(RootModel): class Variation(RootModel): """A representation of the state of one or more biomolecules.""" - root: Union[Allele, CopyNumberChange, CopyNumberCount, Haplotype] = Field( + root: Union[Allele, CisPhasedBlock, Adjacency, SequenceTerminus, DerivativeSequence, CopyNumberChange, CopyNumberCount] = Field( ..., json_schema_extra={ 'description': 'A representation of the state of one or more biomolecules.' diff --git a/src/ga4gh/vrs/normalize.py b/src/ga4gh/vrs/normalize.py index 4c0eac6f..33dc70a5 100644 --- a/src/ga4gh/vrs/normalize.py +++ b/src/ga4gh/vrs/normalize.py @@ -257,7 +257,7 @@ def _is_valid_cycle(template_start, template, target): # TODO _normalize_genotype? -def _normalize_haplotype(o, data_proxy=None): +def _normalize_cis_phased_block(o, data_proxy=None): o.members = sorted(o.members, key=ga4gh_digest) return o @@ -265,7 +265,7 @@ def _normalize_haplotype(o, data_proxy=None): handlers = { "Allele": _normalize_allele, - "Haplotype": _normalize_haplotype, + "CisPhasedBlock": _normalize_cis_phased_block, } diff --git a/submodules/vrs b/submodules/vrs index 454c5312..ca301809 160000 --- a/submodules/vrs +++ b/submodules/vrs @@ -1 +1 @@ -Subproject commit 454c5312e8e425eb170901c7520311f3ca7904e3 +Subproject commit ca3018094397d62025a65b12c991c6189f4275af diff --git a/tests/test_vrs.py b/tests/test_vrs.py index d3a7a41f..dc8d40c0 100644 --- a/tests/test_vrs.py +++ b/tests/test_vrs.py @@ -1,5 +1,18 @@ -from ga4gh.core import sha512t24u, ga4gh_digest, ga4gh_serialize, ga4gh_identify, is_pydantic_instance -from ga4gh.vrs import models, vrs_deref, vrs_enref +from pydantic import ValidationError +import pytest + +from ga4gh.core import ( + sha512t24u, + ga4gh_digest, + ga4gh_serialize, + ga4gh_identify, + is_pydantic_instance, + is_curie_type, + pydantic_copy, + use_ga4gh_compute_identifier_when, + VrsObjectIdentifierIs +) +from ga4gh.vrs import models, vrs_enref, vrs_deref allele_dict = { 'location': { @@ -20,46 +33,327 @@ a = models.Allele(**allele_dict) +allele_383650_dict = { + "type": "Allele", + "digest": "SZIS2ua7AL-0YgUTAqyBsFPYK3vE8h_d", + "id": "ga4gh:VA.SZIS2ua7AL-0YgUTAqyBsFPYK3vE8h_d", + "location": { + "id": "ga4gh:SL.TaoXEhpHvA6SdilBUO-AX00YDARv9Uoe", + "digest": "TaoXEhpHvA6SdilBUO-AX00YDARv9Uoe", + "type": "SequenceLocation", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.KEO-4XBcm1cxeo_DIQ8_ofqGUkp4iZhI" + }, + "start": 128325834, + "end": 128325835 + }, + "state": { + "type": "LiteralSequenceExpression", + "sequence": "T" + } +} +allele_417816_dict = { + "type": "Allele", + "location": { + "type": "SequenceLocation", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.KEO-4XBcm1cxeo_DIQ8_ofqGUkp4iZhI" + }, + "start": 128325809, + "end": 128325810 + }, + "state": { + "type": "LiteralSequenceExpression", + "sequence": "T" + } +} +allele_280320_dict = { + "type": "Allele", + "location": { + "type": "SequenceLocation", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.KEO-4XBcm1cxeo_DIQ8_ofqGUkp4iZhI" + }, + "start": 128322879, + "end": 128322891 + }, + "state": { + "type": "LiteralSequenceExpression", + "sequence": "G" + } +} +allele_383650 = models.Allele(**allele_383650_dict) +allele_417816 = models.Allele(**allele_417816_dict) +allele_280320 = models.Allele(**allele_280320_dict) -def test_vr(): +cpb_431012_dict = { + "type": "CisPhasedBlock", + "members": [allele_383650_dict, allele_417816_dict] +} +cpb_431012 = models.CisPhasedBlock(**cpb_431012_dict) - # assert a.model_dump() == allele_dict # TODO with model_config['extra'] == allow this assertion will always fail +def test_vr(): + assert a.model_dump(exclude_none=True) == allele_dict + assert is_pydantic_instance(a) assert is_pydantic_instance(a.location) + assert is_pydantic_instance(a.location.sequenceReference) - assert ga4gh_serialize( - a.location - ) == b'{"end":55181320,"sequenceReference":{"refgetAccession":"SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul","type":"SequenceReference"},"start":55181319,"type":"SequenceLocation"}' - assert sha512t24u(ga4gh_serialize(a.location)) == '_G2K0qSioM74l_u3OaKR0mgLYdeTL7Xd' - assert ga4gh_digest(a.location) == '_G2K0qSioM74l_u3OaKR0mgLYdeTL7Xd' - assert ga4gh_identify(a.location) == 'ga4gh:SL._G2K0qSioM74l_u3OaKR0mgLYdeTL7Xd' + # Sequence Reference + seqref = a.location.sequenceReference + seqref_serialized = ga4gh_serialize(seqref) + assert seqref_serialized == b'{"refgetAccession":"SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul","type":"SequenceReference"}' + assert ga4gh_digest(seqref) is None + assert ga4gh_identify(seqref) is None - assert ga4gh_serialize( - a - ) == b'{"location":"_G2K0qSioM74l_u3OaKR0mgLYdeTL7Xd","state":{"sequence":"T","type":"LiteralSequenceExpression"},"type":"Allele"}' + # Location + loc = a.location + loc_serialized = ga4gh_serialize(loc) + assert loc_serialized == b'{"end":55181320,"sequenceReference":{"refgetAccession":"SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul","type":"SequenceReference"},"start":55181319,"type":"SequenceLocation"}' + assert sha512t24u(loc_serialized) == '_G2K0qSioM74l_u3OaKR0mgLYdeTL7Xd' + assert ga4gh_digest(loc) == '_G2K0qSioM74l_u3OaKR0mgLYdeTL7Xd' + assert ga4gh_identify(loc) == 'ga4gh:SL._G2K0qSioM74l_u3OaKR0mgLYdeTL7Xd' + + # Allele + allele_serialized = ga4gh_serialize(a) + assert allele_serialized == b'{"location":"_G2K0qSioM74l_u3OaKR0mgLYdeTL7Xd","state":{"sequence":"T","type":"LiteralSequenceExpression"},"type":"Allele"}' + assert sha512t24u(allele_serialized) == 'Hy2XU_-rp4IMh6I_1NXNecBo8Qx8n0oE' assert ga4gh_digest(a) == 'Hy2XU_-rp4IMh6I_1NXNecBo8Qx8n0oE' assert ga4gh_identify(a) == 'ga4gh:VA.Hy2XU_-rp4IMh6I_1NXNecBo8Qx8n0oE' - # assert a.model_dump(exclude_none=True) == { - # 'location': { - # 'end': 55181320, - # 'sequence': 'ga4gh:SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul', - # 'start': 55181319, - # 'type': 'SequenceLocation' - # }, - # 'state': { - # 'sequence': 'T', - # 'type': 'LiteralSequenceExpression' - # }, - # 'type': 'Allele' - # } - - # vros = {} - # a2 = vrs_enref(a, vros) - # assert ga4gh_identify(a) == ga4gh_identify(a2) - # assert a2.location == "ga4gh:SL.Npx4j5beiNN9GSFTm8Ml6YxrNj_Ghkac" - # assert a2.location in vros - # assert ga4gh_identify(a) in vros - # - # a3 = vrs_deref(a2, vros) - # assert a == a3 + with pytest.raises(ValidationError): + models.Allele(**{ + "type": "Allele", + "location": { + "type": "SequenceLocation", + "sequenceReference": { + "type": "SequenceReference", + # refgetAccession can't include a namespace prefix + "refgetAccession": "ga4gh:SQ.KEO-4XBcm1cxeo_DIQ8_ofqGUkp4iZhI" + }, + "start": 128325834, + "end": 128325835 + }, + "state": { + "type": "LiteralSequenceExpression", + "sequence": "T" + } + }) + with pytest.raises(ValidationError): + models.Allele(**{ + "type": "Allele", + "location": { + "type": "SequenceLocation", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.KEO-4XBcm1cxeo_DIQ8_ofqGUkp4iZhI" + }, + "start": 128325834, + "end": 128325835 + }, + "state": { + "type": "LiteralSequenceExpression", + "sequence": "T" + }, + # digest can't include a namespace prefix + "digest": "ga4gh:734G5mtNwe40do8F6GKuqQP4QxyjBqVp" + }) + + +def test_cpb(): + assert cpb_431012.model_dump(exclude_none=True) == cpb_431012_dict + assert is_pydantic_instance(cpb_431012) + cpb_serialized = ga4gh_serialize(cpb_431012) + assert cpb_serialized == b'{"members":["SZIS2ua7AL-0YgUTAqyBsFPYK3vE8h_d","TKhpDsfclpSXpn6BjTLViB_ceqRerOd2"],"type":"CisPhasedBlock"}' + assert sha512t24u(cpb_serialized) == 'x8GH5G73cPMs37jy1-9mJjWynu324rxI' + assert ga4gh_digest(cpb_431012) == 'x8GH5G73cPMs37jy1-9mJjWynu324rxI' + assert ga4gh_identify(cpb_431012) == 'ga4gh:CPB.x8GH5G73cPMs37jy1-9mJjWynu324rxI' + + +def test_ga4gh_iri(): + iri = models.IRI.model_construct("ga4gh:VA.Hy2XU_-rp4IMh6I_1NXNecBo8Qx8n0oE") + assert is_curie_type(iri) + assert iri.root == pydantic_copy(iri).root + assert ga4gh_serialize(iri) == b'"Hy2XU_-rp4IMh6I_1NXNecBo8Qx8n0oE"' + + +def test_enref(): + object_store = {} + allele_383650.get_or_create_ga4gh_identifier() + allele_383650_enreffed = vrs_enref(allele_383650, object_store=object_store) + orig_no_loc = allele_383650.model_dump().copy() + orig_no_loc.pop("location") + actual_no_loc = allele_383650_enreffed.model_dump().copy() + actual_no_loc.pop("location") + assert actual_no_loc == orig_no_loc, "Original and enreffed match except for enreffed field" + assert allele_383650_enreffed.location == 'ga4gh:SL.TaoXEhpHvA6SdilBUO-AX00YDARv9Uoe' + assert (allele_383650_enreffed.model_dump(exclude_none=True) == { + 'digest': 'SZIS2ua7AL-0YgUTAqyBsFPYK3vE8h_d', + 'id': 'ga4gh:VA.SZIS2ua7AL-0YgUTAqyBsFPYK3vE8h_d', + 'type': 'Allele', + 'location': 'ga4gh:SL.TaoXEhpHvA6SdilBUO-AX00YDARv9Uoe', + 'state': { + 'type': 'LiteralSequenceExpression', + 'sequence': 'T'}}) + + + dereffed = vrs_deref(allele_383650_enreffed, object_store=object_store) + assert (dereffed.location.model_dump(exclude_none=True) == { + 'digest': 'TaoXEhpHvA6SdilBUO-AX00YDARv9Uoe', + 'id': 'ga4gh:SL.TaoXEhpHvA6SdilBUO-AX00YDARv9Uoe', + 'type': 'SequenceLocation', + 'sequenceReference': { + 'type': 'SequenceReference', + 'refgetAccession': 'SQ.KEO-4XBcm1cxeo_DIQ8_ofqGUkp4iZhI' + }, + 'start': 128325834, + 'end': 128325835}) + assert dereffed.location.model_dump(exclude_none=True) == allele_383650.location.model_dump(exclude_none=True) + assert dereffed.model_dump() == allele_383650.model_dump() + + +def test_enref2(): + object_store = {} + a = { + "type": "Allele", + "id": "ga4gh:VA.LDzK5JahEZG2Ua_5itDtVV8v3O1ptTgI", + "digest": "LDzK5JahEZG2Ua_5itDtVV8v3O1ptTgI", + "location": { + "id": "ga4gh:SL.wIlaGykfwHIpPY2Fcxtbx4TINbbODFVz", + "digest": "wIlaGykfwHIpPY2Fcxtbx4TINbbODFVz", + "type": "SequenceLocation", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl" + }, + "start": 44908821, + "end": 44908822 + }, + "state": { + "type": "LiteralSequenceExpression", + "sequence": "T" + } + } + vo_a = models.Allele(**a) + a_enreffed = vrs_enref(vo_a, object_store=object_store) + orig_no_loc = vo_a.model_dump().copy() + orig_no_loc.pop("location") + actual_no_loc = a_enreffed.model_dump().copy() + actual_no_loc.pop("location") + assert orig_no_loc == actual_no_loc, "Original and enreffed match except for enreffed field" + assert a_enreffed.location == 'ga4gh:SL.wIlaGykfwHIpPY2Fcxtbx4TINbbODFVz' + assert a_enreffed.model_dump(exclude_none=True) == { + 'id': 'ga4gh:VA.LDzK5JahEZG2Ua_5itDtVV8v3O1ptTgI', + 'digest': 'LDzK5JahEZG2Ua_5itDtVV8v3O1ptTgI', + 'type': 'Allele', + 'location': 'ga4gh:SL.wIlaGykfwHIpPY2Fcxtbx4TINbbODFVz', + 'state': { + 'type': 'LiteralSequenceExpression', + 'sequence': 'T' + } + } + + +def test_class_refatt_map(): + class_refatt_map_expected = { + 'Allele': ['location'], + 'CisPhasedBlock': ['members'], + '_CopyNumber': ['location'], + 'CopyNumberCount': ['location'], + 'CopyNumberChange': ['location'], + 'Adjacency': ['adjoinedSequences'], + 'DerivativeSequence': ['components'], + 'SequenceTerminus': ['location'] + } + assert class_refatt_map_expected == models.class_refatt_map + + +def test_compute_identifiers_when(): + a = { + "type": "Allele", + "location": { + "type": "SequenceLocation", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.jdEWLvLvT8827O59m1Agh5H3n6kTzBsJ", + }, + "start": 44908821, + "end": 44908822, + }, + "state": {"type": "LiteralSequenceExpression", "sequence": "T"}, + } + correct_id = "ga4gh:VA.NRUtY5Jcoevxr0tIgbNa-oIFm-Gv4qas" + syntax_valid_id = "ga4gh:VA.39eae078d9bb30da2a5c5d1969cb1472" + syntax_invalid_id = "ga4gh:12345" + + # when id property is missing + vo_a = models.Allele(**a) + assert ga4gh_identify(vo_a, in_place='never') == correct_id + with use_ga4gh_compute_identifier_when(VrsObjectIdentifierIs.ANY): + assert ga4gh_identify(vo_a, in_place='never') == correct_id + with use_ga4gh_compute_identifier_when(VrsObjectIdentifierIs.GA4GH_INVALID): + assert ga4gh_identify(vo_a, in_place='never') == correct_id + with use_ga4gh_compute_identifier_when(VrsObjectIdentifierIs.MISSING): + assert ga4gh_identify(vo_a, in_place='never') == correct_id + + # when id property is none + a["id"] = None + vo_a = models.Allele(**a) + assert ga4gh_identify(vo_a, in_place='never') == correct_id + with use_ga4gh_compute_identifier_when(VrsObjectIdentifierIs.ANY): + assert ga4gh_identify(vo_a, in_place='never') == correct_id + with use_ga4gh_compute_identifier_when(VrsObjectIdentifierIs.GA4GH_INVALID): + assert ga4gh_identify(vo_a, in_place='never') == correct_id + with use_ga4gh_compute_identifier_when(VrsObjectIdentifierIs.MISSING): + assert ga4gh_identify(vo_a, in_place='never') == correct_id + + # when id property is blank + a["id"] = "" + vo_a = models.Allele(**a) + assert ga4gh_identify(vo_a, in_place='never') == correct_id + with use_ga4gh_compute_identifier_when(VrsObjectIdentifierIs.ANY): + assert ga4gh_identify(vo_a, in_place='never') == correct_id + with use_ga4gh_compute_identifier_when(VrsObjectIdentifierIs.GA4GH_INVALID): + assert ga4gh_identify(vo_a, in_place='never') == correct_id + with use_ga4gh_compute_identifier_when(VrsObjectIdentifierIs.MISSING): + assert ga4gh_identify(vo_a, in_place='never') == correct_id + + # when id property is syntactically invalid + a["id"] = syntax_invalid_id + vo_a = models.Allele(**a) + assert ga4gh_identify(vo_a, in_place='never') == correct_id + with use_ga4gh_compute_identifier_when(VrsObjectIdentifierIs.ANY): + assert ga4gh_identify(vo_a, in_place='never') == correct_id + with use_ga4gh_compute_identifier_when(VrsObjectIdentifierIs.GA4GH_INVALID): + assert ga4gh_identify(vo_a, in_place='never') == correct_id + with use_ga4gh_compute_identifier_when(VrsObjectIdentifierIs.MISSING): + assert ga4gh_identify(vo_a, in_place='never') == syntax_invalid_id + + # when id property is syntactically valid + a["id"] = syntax_valid_id + vo_a = models.Allele(**a) + assert ga4gh_identify(vo_a, in_place='never') == correct_id + with use_ga4gh_compute_identifier_when(VrsObjectIdentifierIs.ANY): + assert ga4gh_identify(vo_a, in_place='never') == correct_id + with use_ga4gh_compute_identifier_when(VrsObjectIdentifierIs.GA4GH_INVALID): + assert ga4gh_identify(vo_a, in_place='never') == syntax_valid_id + with use_ga4gh_compute_identifier_when(VrsObjectIdentifierIs.MISSING): + assert ga4gh_identify(vo_a, in_place='never') == syntax_valid_id + + # when id property is correct + a["id"] = correct_id + vo_a = models.Allele(**a) + assert ga4gh_identify(vo_a, in_place='never') == correct_id + assert ga4gh_identify(vo_a, in_place='never') is not correct_id + with use_ga4gh_compute_identifier_when(VrsObjectIdentifierIs.ANY): + assert ga4gh_identify(vo_a, in_place='never') == correct_id + assert ga4gh_identify(vo_a, in_place='never') is not correct_id + with use_ga4gh_compute_identifier_when(VrsObjectIdentifierIs.GA4GH_INVALID): + assert ga4gh_identify(vo_a, in_place='never') == correct_id + assert ga4gh_identify(vo_a, in_place='never') is correct_id + with use_ga4gh_compute_identifier_when(VrsObjectIdentifierIs.MISSING): + assert ga4gh_identify(vo_a, in_place='never') == correct_id + assert ga4gh_identify(vo_a, in_place='never') is correct_id diff --git a/tests/test_vrs2.py b/tests/test_vrs2.py deleted file mode 100644 index 7af994d8..00000000 --- a/tests/test_vrs2.py +++ /dev/null @@ -1,397 +0,0 @@ -from pydantic import ValidationError -import pytest - -from ga4gh.core import ( - sha512t24u, - ga4gh_digest, - ga4gh_serialize, - ga4gh_identify, - is_pydantic_instance, - is_curie_type, - pydantic_copy, - use_ga4gh_compute_identifier_when, - VrsObjectIdentifierIs -) -from ga4gh.vrs import models, vrs_enref, vrs_deref - -allele_dict = { - 'location': { - 'end': 55181320, - 'start': 55181319, - 'sequenceReference': { - 'type': 'SequenceReference', - 'refgetAccession': 'SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul' - }, - 'type': 'SequenceLocation' - }, - 'state': { - 'sequence': 'T', - 'type': 'LiteralSequenceExpression' - }, - 'type': 'Allele' -} - -a = models.Allele(**allele_dict) - -allele_383650_dict = { - "type": "Allele", - "digest": "SZIS2ua7AL-0YgUTAqyBsFPYK3vE8h_d", - "id": "ga4gh:VA.SZIS2ua7AL-0YgUTAqyBsFPYK3vE8h_d", - "location": { - "id": "ga4gh:SL.TaoXEhpHvA6SdilBUO-AX00YDARv9Uoe", - "digest": "TaoXEhpHvA6SdilBUO-AX00YDARv9Uoe", - "type": "SequenceLocation", - "sequenceReference": { - "type": "SequenceReference", - "refgetAccession": "SQ.KEO-4XBcm1cxeo_DIQ8_ofqGUkp4iZhI" - }, - "start": 128325834, - "end": 128325835 - }, - "state": { - "type": "LiteralSequenceExpression", - "sequence": "T" - } -} -allele_417816_dict = { - "type": "Allele", - "location": { - "type": "SequenceLocation", - "sequenceReference": { - "type": "SequenceReference", - "refgetAccession": "SQ.KEO-4XBcm1cxeo_DIQ8_ofqGUkp4iZhI" - }, - "start": 128325809, - "end": 128325810 - }, - "state": { - "type": "LiteralSequenceExpression", - "sequence": "T" - } -} -allele_280320_dict = { - "type": "Allele", - "location": { - "type": "SequenceLocation", - "sequenceReference": { - "type": "SequenceReference", - "refgetAccession": "SQ.KEO-4XBcm1cxeo_DIQ8_ofqGUkp4iZhI" - }, - "start": 128322879, - "end": 128322891 - }, - "state": { - "type": "LiteralSequenceExpression", - "sequence": "G" - } -} -allele_383650 = models.Allele(**allele_383650_dict) -allele_417816 = models.Allele(**allele_417816_dict) -allele_280320 = models.Allele(**allele_280320_dict) - -haplotype_431012_dict = { - "type": "Haplotype", - "members": [allele_383650_dict, allele_417816_dict] -} -haplotype_431012 = models.Haplotype(**haplotype_431012_dict) - -# genotype_431013_dict = { -# "type": "Genotype", -# "count": 1, -# "members": [ -# { -# "type": "GenotypeMember", -# "variation": haplotype_431012_dict, -# "count": 1 -# }, -# { -# "type": "GenotypeMember", -# "variation": allele_280320_dict, -# "count": 1 -# } -# ] -# } -# genotype_431013 = models.Genotype(**genotype_431013_dict) - - -def test_vr(): - assert a.model_dump(exclude_none=True) == allele_dict - assert is_pydantic_instance(a) - assert is_pydantic_instance(a.location) - assert is_pydantic_instance(a.location.sequenceReference) - - # Sequence Reference - seqref = a.location.sequenceReference - seqref_serialized = ga4gh_serialize(seqref) - assert seqref_serialized == b'{"refgetAccession":"SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul","type":"SequenceReference"}' - assert ga4gh_digest(seqref) is None - assert ga4gh_identify(seqref) is None - - # Location - loc = a.location - loc_serialized = ga4gh_serialize(loc) - assert loc_serialized == b'{"end":55181320,"sequenceReference":{"refgetAccession":"SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul","type":"SequenceReference"},"start":55181319,"type":"SequenceLocation"}' - assert sha512t24u(loc_serialized) == '_G2K0qSioM74l_u3OaKR0mgLYdeTL7Xd' - assert ga4gh_digest(loc) == '_G2K0qSioM74l_u3OaKR0mgLYdeTL7Xd' - assert ga4gh_identify(loc) == 'ga4gh:SL._G2K0qSioM74l_u3OaKR0mgLYdeTL7Xd' - - # Allele - allele_serialized = ga4gh_serialize(a) - assert allele_serialized == b'{"location":"_G2K0qSioM74l_u3OaKR0mgLYdeTL7Xd","state":{"sequence":"T","type":"LiteralSequenceExpression"},"type":"Allele"}' - assert sha512t24u(allele_serialized) == 'Hy2XU_-rp4IMh6I_1NXNecBo8Qx8n0oE' - assert ga4gh_digest(a) == 'Hy2XU_-rp4IMh6I_1NXNecBo8Qx8n0oE' - assert ga4gh_identify(a) == 'ga4gh:VA.Hy2XU_-rp4IMh6I_1NXNecBo8Qx8n0oE' - - # Commenting out enref/deref tests. - # We are deciding whether this will continue to be included in this library. - # vros = {} - # a2 = vrs_enref(a, vros) - # assert ga4gh_identify(a) == ga4gh_identify(a2) - # assert a2.location == "ga4gh:SL.Npx4j5beiNN9GSFTm8Ml6YxrNj_Ghkac" - # assert a2.location in vros - # assert ga4gh_identify(a) in vros - # a3 = vrs_deref(a2, vros) - # assert a == a3 - - with pytest.raises(ValidationError): - models.Allele(**{ - "type": "Allele", - "location": { - "type": "SequenceLocation", - "sequenceReference": { - "type": "SequenceReference", - # refgetAccession can't include a namespace prefix - "refgetAccession": "ga4gh:SQ.KEO-4XBcm1cxeo_DIQ8_ofqGUkp4iZhI" - }, - "start": 128325834, - "end": 128325835 - }, - "state": { - "type": "LiteralSequenceExpression", - "sequence": "T" - } - }) - with pytest.raises(ValidationError): - models.Allele(**{ - "type": "Allele", - "location": { - "type": "SequenceLocation", - "sequenceReference": { - "type": "SequenceReference", - "refgetAccession": "SQ.KEO-4XBcm1cxeo_DIQ8_ofqGUkp4iZhI" - }, - "start": 128325834, - "end": 128325835 - }, - "state": { - "type": "LiteralSequenceExpression", - "sequence": "T" - }, - # digest can't include a namespace prefix - "digest": "ga4gh:734G5mtNwe40do8F6GKuqQP4QxyjBqVp" - }) - - -def test_haplotype(): - assert haplotype_431012.model_dump(exclude_none=True) == haplotype_431012_dict - assert is_pydantic_instance(haplotype_431012) - haplotype_serialized = ga4gh_serialize(haplotype_431012) - assert haplotype_serialized == b'{"members":["SZIS2ua7AL-0YgUTAqyBsFPYK3vE8h_d","TKhpDsfclpSXpn6BjTLViB_ceqRerOd2"],"type":"Haplotype"}' - assert sha512t24u(haplotype_serialized) == 'kAFlqAFWNj5xZIv5G_ePM7xepXe5p8TK' - assert ga4gh_digest(haplotype_431012) == 'kAFlqAFWNj5xZIv5G_ePM7xepXe5p8TK' - assert ga4gh_identify(haplotype_431012) == 'ga4gh:HT.kAFlqAFWNj5xZIv5G_ePM7xepXe5p8TK' - - -@pytest.mark.skip(reason="Genotypes are not yet supported in 2.x") -def test_genotype(): - assert genotype_431013.model_dump(exclude_none=True) == genotype_431013_dict - assert is_pydantic_instance(genotype_431013) - genotype_serialized = ga4gh_serialize(genotype_431013) - assert genotype_serialized == b'{"count":1,"members":[{"count":1,"type":"GenotypeMember","variation":"fFR5oRpeD8Cuq2hfs3bXd1rgJUQrQA26"},{"count":1,"type":"GenotypeMember","variation":"AUYSTKn2HElZ_Gg-Cv9Pm6Yx9Xpvx8Tm"}],"type":"Genotype"}' - assert sha512t24u(genotype_serialized) == '51J0mMryCGjdce3qBpqNt4n_hXUQmw83' - assert ga4gh_digest(genotype_431013) == '51J0mMryCGjdce3qBpqNt4n_hXUQmw83' - assert ga4gh_identify(genotype_431013) == 'ga4gh:GT.51J0mMryCGjdce3qBpqNt4n_hXUQmw83' - - -def test_ga4gh_iri(): - iri = models.IRI.model_construct("ga4gh:VA.Hy2XU_-rp4IMh6I_1NXNecBo8Qx8n0oE") - assert is_curie_type(iri) - assert iri.root == pydantic_copy(iri).root - assert ga4gh_serialize(iri) == b'"Hy2XU_-rp4IMh6I_1NXNecBo8Qx8n0oE"' - - -def test_enref(): - object_store = {} - allele_383650.get_or_create_ga4gh_identifier() - allele_383650_enreffed = vrs_enref(allele_383650, object_store=object_store) - orig_no_loc = allele_383650.model_dump().copy() - orig_no_loc.pop("location") - actual_no_loc = allele_383650_enreffed.model_dump().copy() - actual_no_loc.pop("location") - assert actual_no_loc == orig_no_loc, "Original and enreffed match except for enreffed field" - assert allele_383650_enreffed.location == 'ga4gh:SL.TaoXEhpHvA6SdilBUO-AX00YDARv9Uoe' - assert (allele_383650_enreffed.model_dump(exclude_none=True) == { - 'digest': 'SZIS2ua7AL-0YgUTAqyBsFPYK3vE8h_d', - 'id': 'ga4gh:VA.SZIS2ua7AL-0YgUTAqyBsFPYK3vE8h_d', - 'type': 'Allele', - 'location': 'ga4gh:SL.TaoXEhpHvA6SdilBUO-AX00YDARv9Uoe', - 'state': { - 'type': 'LiteralSequenceExpression', - 'sequence': 'T'}}) - - - dereffed = vrs_deref(allele_383650_enreffed, object_store=object_store) - assert (dereffed.location.model_dump(exclude_none=True) == { - 'digest': 'TaoXEhpHvA6SdilBUO-AX00YDARv9Uoe', - 'id': 'ga4gh:SL.TaoXEhpHvA6SdilBUO-AX00YDARv9Uoe', - 'type': 'SequenceLocation', - 'sequenceReference': { - 'type': 'SequenceReference', - 'refgetAccession': 'SQ.KEO-4XBcm1cxeo_DIQ8_ofqGUkp4iZhI' - }, - 'start': 128325834, - 'end': 128325835}) - assert dereffed.location.model_dump(exclude_none=True) == allele_383650.location.model_dump(exclude_none=True) - assert dereffed.model_dump() == allele_383650.model_dump() - - -def test_enref2(): - object_store = {} - a = { - "type": "Allele", - "id": "ga4gh:VA.LDzK5JahEZG2Ua_5itDtVV8v3O1ptTgI", - "digest": "LDzK5JahEZG2Ua_5itDtVV8v3O1ptTgI", - "location": { - "id": "ga4gh:SL.wIlaGykfwHIpPY2Fcxtbx4TINbbODFVz", - "digest": "wIlaGykfwHIpPY2Fcxtbx4TINbbODFVz", - "type": "SequenceLocation", - "sequenceReference": { - "type": "SequenceReference", - "refgetAccession": "SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl" - }, - "start": 44908821, - "end": 44908822 - }, - "state": { - "type": "LiteralSequenceExpression", - "sequence": "T" - } - } - vo_a = models.Allele(**a) - a_enreffed = vrs_enref(vo_a, object_store=object_store) - orig_no_loc = vo_a.model_dump().copy() - orig_no_loc.pop("location") - actual_no_loc = a_enreffed.model_dump().copy() - actual_no_loc.pop("location") - assert orig_no_loc == actual_no_loc, "Original and enreffed match except for enreffed field" - assert a_enreffed.location == 'ga4gh:SL.wIlaGykfwHIpPY2Fcxtbx4TINbbODFVz' - assert a_enreffed.model_dump(exclude_none=True) == { - 'id': 'ga4gh:VA.LDzK5JahEZG2Ua_5itDtVV8v3O1ptTgI', - 'digest': 'LDzK5JahEZG2Ua_5itDtVV8v3O1ptTgI', - 'type': 'Allele', - 'location': 'ga4gh:SL.wIlaGykfwHIpPY2Fcxtbx4TINbbODFVz', - 'state': { - 'type': 'LiteralSequenceExpression', - 'sequence': 'T' - } - } - - -def test_class_refatt_map(): - class_refatt_map_expected = { - 'Allele': ['location'], - 'Haplotype': ['members'], - '_CopyNumber': ['location'], - 'CopyNumberCount': ['location'], - 'CopyNumberChange': ['location'], - 'Adjacency': ['adjoinedSequences'], - } - assert class_refatt_map_expected == models.class_refatt_map - - -def test_compute_identifiers_when(): - a = { - "type": "Allele", - "location": { - "type": "SequenceLocation", - "sequenceReference": { - "type": "SequenceReference", - "refgetAccession": "SQ.jdEWLvLvT8827O59m1Agh5H3n6kTzBsJ", - }, - "start": 44908821, - "end": 44908822, - }, - "state": {"type": "LiteralSequenceExpression", "sequence": "T"}, - } - correct_id = "ga4gh:VA.NRUtY5Jcoevxr0tIgbNa-oIFm-Gv4qas" - syntax_valid_id = "ga4gh:VA.39eae078d9bb30da2a5c5d1969cb1472" - syntax_invalid_id = "ga4gh:12345" - - # when id property is missing - vo_a = models.Allele(**a) - assert ga4gh_identify(vo_a, in_place='never') == correct_id - with use_ga4gh_compute_identifier_when(VrsObjectIdentifierIs.ANY): - assert ga4gh_identify(vo_a, in_place='never') == correct_id - with use_ga4gh_compute_identifier_when(VrsObjectIdentifierIs.GA4GH_INVALID): - assert ga4gh_identify(vo_a, in_place='never') == correct_id - with use_ga4gh_compute_identifier_when(VrsObjectIdentifierIs.MISSING): - assert ga4gh_identify(vo_a, in_place='never') == correct_id - - # when id property is none - a["id"] = None - vo_a = models.Allele(**a) - assert ga4gh_identify(vo_a, in_place='never') == correct_id - with use_ga4gh_compute_identifier_when(VrsObjectIdentifierIs.ANY): - assert ga4gh_identify(vo_a, in_place='never') == correct_id - with use_ga4gh_compute_identifier_when(VrsObjectIdentifierIs.GA4GH_INVALID): - assert ga4gh_identify(vo_a, in_place='never') == correct_id - with use_ga4gh_compute_identifier_when(VrsObjectIdentifierIs.MISSING): - assert ga4gh_identify(vo_a, in_place='never') == correct_id - - # when id property is blank - a["id"] = "" - vo_a = models.Allele(**a) - assert ga4gh_identify(vo_a, in_place='never') == correct_id - with use_ga4gh_compute_identifier_when(VrsObjectIdentifierIs.ANY): - assert ga4gh_identify(vo_a, in_place='never') == correct_id - with use_ga4gh_compute_identifier_when(VrsObjectIdentifierIs.GA4GH_INVALID): - assert ga4gh_identify(vo_a, in_place='never') == correct_id - with use_ga4gh_compute_identifier_when(VrsObjectIdentifierIs.MISSING): - assert ga4gh_identify(vo_a, in_place='never') == correct_id - - # when id property is syntactically invalid - a["id"] = syntax_invalid_id - vo_a = models.Allele(**a) - assert ga4gh_identify(vo_a, in_place='never') == correct_id - with use_ga4gh_compute_identifier_when(VrsObjectIdentifierIs.ANY): - assert ga4gh_identify(vo_a, in_place='never') == correct_id - with use_ga4gh_compute_identifier_when(VrsObjectIdentifierIs.GA4GH_INVALID): - assert ga4gh_identify(vo_a, in_place='never') == correct_id - with use_ga4gh_compute_identifier_when(VrsObjectIdentifierIs.MISSING): - assert ga4gh_identify(vo_a, in_place='never') == syntax_invalid_id - - # when id property is syntactically valid - a["id"] = syntax_valid_id - vo_a = models.Allele(**a) - assert ga4gh_identify(vo_a, in_place='never') == correct_id - with use_ga4gh_compute_identifier_when(VrsObjectIdentifierIs.ANY): - assert ga4gh_identify(vo_a, in_place='never') == correct_id - with use_ga4gh_compute_identifier_when(VrsObjectIdentifierIs.GA4GH_INVALID): - assert ga4gh_identify(vo_a, in_place='never') == syntax_valid_id - with use_ga4gh_compute_identifier_when(VrsObjectIdentifierIs.MISSING): - assert ga4gh_identify(vo_a, in_place='never') == syntax_valid_id - - # when id property is correct - a["id"] = correct_id - vo_a = models.Allele(**a) - assert ga4gh_identify(vo_a, in_place='never') == correct_id - assert ga4gh_identify(vo_a, in_place='never') is not correct_id - with use_ga4gh_compute_identifier_when(VrsObjectIdentifierIs.ANY): - assert ga4gh_identify(vo_a, in_place='never') == correct_id - assert ga4gh_identify(vo_a, in_place='never') is not correct_id - with use_ga4gh_compute_identifier_when(VrsObjectIdentifierIs.GA4GH_INVALID): - assert ga4gh_identify(vo_a, in_place='never') == correct_id - assert ga4gh_identify(vo_a, in_place='never') is correct_id - with use_ga4gh_compute_identifier_when(VrsObjectIdentifierIs.MISSING): - assert ga4gh_identify(vo_a, in_place='never') == correct_id - assert ga4gh_identify(vo_a, in_place='never') is correct_id From a2477e1efc98659f9c608084dcd5707793c481e6 Mon Sep 17 00:00:00 2001 From: Kori Kuzma Date: Wed, 10 Jul 2024 10:41:59 -0400 Subject: [PATCH 3/4] feat: create enum for gks-common + vrs types (#420) close #309 --- src/ga4gh/core/_internal/models.py | 78 +++++++++++++++++++----------- src/ga4gh/vrs/_internal/models.py | 47 ++++++++++++------ 2 files changed, 82 insertions(+), 43 deletions(-) diff --git a/src/ga4gh/core/_internal/models.py b/src/ga4gh/core/_internal/models.py index 03d03b67..4ab17edc 100644 --- a/src/ga4gh/core/_internal/models.py +++ b/src/ga4gh/core/_internal/models.py @@ -176,6 +176,15 @@ class Expression(BaseModel): ######################################### +class CommonEntityType(str, Enum): + """Define GKS Common Entity types""" + + AGENT = "Agent" + CONTRIBUTION = "Contribution" + DOCUMENT = "Document" + METHOD = "Method" + + class _Entity(BaseModel): """Entity is the root class of the 'gks-common' core information model classes - those that have identifiers and other general metadata like labels, xrefs, urls, @@ -216,7 +225,7 @@ class Agent(_Entity): entity, or for another agent's activity. """ - type: Literal["Agent"] = Field("Agent", description="MUST be 'Agent'.") + type: Literal[CommonEntityType.AGENT] = Field(CommonEntityType.AGENT, description=f"MUST be '{CommonEntityType.AGENT.value}'.") name: Optional[str] = Field(None, description="The descriptive name of the agent.") subtype: Optional[AgentSubtype] = Field(None, description="A more specific type of agent the agent represents.") @@ -254,7 +263,7 @@ class Contribution(Activity): DataItem, Publication, etc.) """ - type: Literal["Contribution"] = "Contribution" + type: Literal[CommonEntityType.CONTRIBUTION] = CommonEntityType.CONTRIBUTION contributor: Optional[Agent] = Field(None, description="The agent that made the contribution.") contributionMadeTo: Optional[_InformationEntity] = Field(None, description="The artifact toward which the contribution was made.") # noqa: N815 activityType: Optional[Coding] = Field(None, description="SHOULD describe a concept descending from the Contributor Role Ontology.") @@ -287,7 +296,7 @@ class _InformationEntity(_Entity): class Document(_InformationEntity): """a representation of a physical or digital document""" - type: Literal["Document"] = "Document" + type: Literal[CommonEntityType.DOCUMENT] = CommonEntityType.DOCUMENT subtype: Optional[Coding] = Field( None, description="A more specific type for the document (e.g. a publication, patent, pathology report)" ) @@ -310,7 +319,7 @@ class Method(_InformationEntity): experimental protocols, curation guidelines, rule sets, etc.) """ - type: Literal["Method"] = Field("Method", description="MUST be 'Method'.") + type: Literal[CommonEntityType.METHOD] = Field(CommonEntityType.METHOD, description=f"MUST be '{CommonEntityType.METHOD.value}'.") isReportedIn: Optional[Union[Document, IRI]] = None # noqa: N815 subtype: Optional[Coding] = Field( None, @@ -323,12 +332,25 @@ class Method(_InformationEntity): # GKS Common Domain Entities ######################################### + +class CommonDomainType(str, Enum): + """Define GKS Common Domain Entity types""" + + PHENOTYPE = "Phenotype" + DISEASE = "Disease" + TRAIT_SET = "TraitSet" + TR_ACTION = "TherapeuticAction" + TR_AGENT = "TherapeuticAgent" + TR_SUB = "TherapeuticSubstituteGroup" + TR_COMB = "CombinationTherapy" + GENE = "Gene" + class Phenotype(_DomainEntity): """An observable characteristic or trait of an organism.""" - type: Literal['Phenotype'] = Field( - 'Phenotype', - description='MUST be "Phenotype".' + type: Literal[CommonDomainType.PHENOTYPE] = Field( + CommonDomainType.PHENOTYPE, + description=f'MUST be "{CommonDomainType.PHENOTYPE.value}".' ) @@ -337,18 +359,18 @@ class Disease(_DomainEntity): of all or part of an organism and is not immediately due to any external injury. """ - type: Literal['Disease'] = Field( - 'Disease', - description='MUST be "Disease".' + type: Literal[CommonDomainType.DISEASE] = Field( + CommonDomainType.DISEASE, + description=f'MUST be "{CommonDomainType.DISEASE.value}".' ) class TraitSet(_DomainEntity): """A set of phenotype and/or disease concepts that together constitute a condition.""" - type: Literal['TraitSet'] = Field( - 'TraitSet', - description='MUST be "TraitSet".' + type: Literal[CommonDomainType.TRAIT_SET] = Field( + CommonDomainType.TRAIT_SET, + description=f'MUST be "{CommonDomainType.TRAIT_SET.value}".' ) traits: List[Union[Disease, Phenotype]] = Field( ..., @@ -369,27 +391,27 @@ class Condition(RootModel): class TherapeuticAction(_DomainEntity): """A therapeutic action taken that is intended to alter or stop a pathologic process.""" - type: Literal['TherapeuticAction'] = Field( - 'TherapeuticAction', - description='MUST be "TherapeuticAction".' + type: Literal[CommonDomainType.TR_ACTION] = Field( + CommonDomainType.TR_ACTION, + description=f'MUST be "{CommonDomainType.TR_ACTION.value}".' ) class TherapeuticAgent(_DomainEntity): """An administered therapeutic agent that is intended to alter or stop a pathologic process.""" - type: Literal['TherapeuticAgent'] = Field( - 'TherapeuticAgent', - description='MUST be "TherapeuticAgent".' + type: Literal[CommonDomainType.TR_AGENT] = Field( + CommonDomainType.TR_AGENT, + description=f'MUST be "{CommonDomainType.TR_AGENT.value}".' ) class TherapeuticSubstituteGroup(_DomainEntity): """A group of therapeutic procedures that may be treated as substitutes for one another.""" - type: Literal['TherapeuticSubstituteGroup'] = Field( - 'TherapeuticSubstituteGroup', - description='MUST be "TherapeuticSubstituteGroup".' + type: Literal[CommonDomainType.TR_SUB] = Field( + CommonDomainType.TR_SUB, + description=f'MUST be "{CommonDomainType.TR_SUB.value}".' ) substitutes: List[Union[TherapeuticAction, TherapeuticAgent]] = Field( ..., @@ -403,9 +425,9 @@ class CombinationTherapy(_DomainEntity): performed in combination. """ - type: Literal['CombinationTherapy'] = Field( - 'CombinationTherapy', - description='MUST be "CombinationTherapy".' + type: Literal[CommonDomainType.TR_COMB] = Field( + CommonDomainType.TR_COMB, + description=f'MUST be "{CommonDomainType.TR_COMB.value}".' ) components: List[Union[TherapeuticSubstituteGroup, TherapeuticAction, TherapeuticAgent]] = Field( ..., @@ -429,7 +451,7 @@ class TherapeuticProcedure(RootModel): class Gene(_DomainEntity): """A basic physical and functional unit of heredity.""" - type: Literal['Gene'] = Field( - 'Gene', - description='MUST be "Gene".' + type: Literal[CommonDomainType.GENE] = Field( + CommonDomainType.GENE, + description=f'MUST be "{CommonDomainType.GENE.value}".' ) diff --git a/src/ga4gh/vrs/_internal/models.py b/src/ga4gh/vrs/_internal/models.py index 66064e75..44ca885b 100644 --- a/src/ga4gh/vrs/_internal/models.py +++ b/src/ga4gh/vrs/_internal/models.py @@ -125,6 +125,23 @@ def pydantic_class_refatt_map(): class_keys) +class VrsTypes(str, Enum): + """Define VRS Types""" + + LEN_EXPR = "LengthExpression" + REF_LEN_EXPR = "ReferenceLengthExpression" + LIT_SEQ_EXPR = "LiteralSequenceExpression" + SEQ_REF = "SequenceReference" + SEQ_LOC = "SequenceLocation" + ALLELE = "Allele" + CIS_PHASED_BLOCK = "CisPhasedBlock" + ADJACENCY = "Adjacency" + SEQ_TERMINUS = "SequenceTerminus" + DERIVATIVE_SEQ = "DerivativeSequence" + CN_COUNT = "CopyNumberCount" + CN_CHANGE = "CopyNumberChange" + + class ResidueAlphabet(str, Enum): """The interpretation of the character codes referred to by the refget accession, where "aa" specifies an amino acid character set, and "na" specifies a nucleic acid @@ -322,8 +339,8 @@ class SequenceString(RootModel): class LengthExpression(_ValueObject): """A sequence expressed only by its length.""" - type: Literal['LengthExpression'] = Field( - 'LengthExpression', description='MUST be "LengthExpression"' + type: Literal[VrsTypes.LEN_EXPR] = Field( + VrsTypes.LEN_EXPR, description=f'MUST be "{VrsTypes.LEN_EXPR.value}"' ) length: Optional[Union[Range, int]] = None @@ -337,8 +354,8 @@ class ga4gh(_ValueObject.ga4gh): class ReferenceLengthExpression(_ValueObject): """An expression of a length of a sequence from a repeating reference.""" - type: Literal['ReferenceLengthExpression'] = Field( - 'ReferenceLengthExpression', description='MUST be "ReferenceLengthExpression"' + type: Literal[VrsTypes.REF_LEN_EXPR] = Field( + VrsTypes.REF_LEN_EXPR, description=f'MUST be "{VrsTypes.REF_LEN_EXPR.value}"' ) length: Union[Range, int] = Field( ..., description='The number of residues of the expressed sequence.' @@ -361,8 +378,8 @@ class ga4gh(_ValueObject.ga4gh): class LiteralSequenceExpression(_ValueObject): """An explicit expression of a Sequence.""" - type: Literal['LiteralSequenceExpression'] = Field( - 'LiteralSequenceExpression', description='MUST be "LiteralSequenceExpression"' + type: Literal[VrsTypes.LIT_SEQ_EXPR] = Field( + VrsTypes.LIT_SEQ_EXPR, description=f'MUST be "{VrsTypes.LIT_SEQ_EXPR.value}"' ) sequence: SequenceString = Field(..., description='the literal sequence') @@ -381,7 +398,7 @@ class ga4gh(_ValueObject.ga4gh): class SequenceReference(_ValueObject): """A sequence of nucleic or amino acid character codes.""" - type: Literal['SequenceReference'] = Field('SequenceReference', description='MUST be "SequenceReference"') + type: Literal[VrsTypes.SEQ_REF] = Field(VrsTypes.SEQ_REF, description=f'MUST be "{VrsTypes.SEQ_REF.value}"') refgetAccession: Annotated[str, StringConstraints(pattern=r'^SQ.[0-9A-Za-z_\-]{32}$')] = Field( ..., description='A `GA4GH RefGet ` identifier for the referenced sequence, using the sha512t24u digest.', @@ -399,7 +416,7 @@ class ga4gh(_ValueObject.ga4gh): class SequenceLocation(_Ga4ghIdentifiableObject): """A `Location` defined by an interval on a referenced `Sequence`.""" - type: Literal['SequenceLocation'] = Field('SequenceLocation', description='MUST be "SequenceLocation"') + type: Literal[VrsTypes.SEQ_LOC] = Field(VrsTypes.SEQ_LOC, description=f'MUST be "{VrsTypes.SEQ_LOC.value}"') sequenceReference: Optional[Union[IRI, SequenceReference]] = Field( None, description='A reference to a `Sequence` on which the location is defined.' ) @@ -449,7 +466,7 @@ class _VariationBase(_Ga4ghIdentifiableObject): class Allele(_VariationBase): """The state of a molecule at a `Location`.""" - type: Literal['Allele'] = Field('Allele', description='MUST be "Allele"') + type: Literal[VrsTypes.ALLELE] = Field(VrsTypes.ALLELE, description=f'MUST be "{VrsTypes.ALLELE.value}"') location: Union[IRI, SequenceLocation] = Field( ..., description='The location of the Allele' ) @@ -469,7 +486,7 @@ class ga4gh(_Ga4ghIdentifiableObject.ga4gh): class CisPhasedBlock(_VariationBase): """An ordered set of co-occurring `Variation` on the same molecule.""" - type: Literal['CisPhasedBlock'] = Field('CisPhasedBlock', description='MUST be "CisPhasedBlock"') + type: Literal[VrsTypes.CIS_PHASED_BLOCK] = Field(VrsTypes.CIS_PHASED_BLOCK, description=f'MUST be "{VrsTypes.CIS_PHASED_BLOCK.value}"') members: List[Union[Allele, IRI]] = Field( ..., description='A list of `Alleles` that are found in-cis on a shared molecule.', @@ -502,7 +519,7 @@ class Adjacency(_VariationBase): potentially with an intervening linker sequence. """ - type: Literal['Adjacency'] = Field('Adjacency', description='MUST be "Adjacency"') + type: Literal[VrsTypes.ADJACENCY] = Field(VrsTypes.ADJACENCY, description=f'MUST be "{VrsTypes.ADJACENCY.value}"') adjoinedSequences: List[Union[IRI, SequenceLocation]] = Field( ..., description="The terminal sequence or pair of adjoined sequences that defines in the adjacency.", @@ -530,7 +547,7 @@ class SequenceTerminus(_VariationBase): is not allowed and it removes the unnecessary array structure. """ - type: Literal["SequenceTerminus"] = Field("SequenceTerminus", description='MUST be "SequenceTerminus"') + type: Literal[VrsTypes.SEQ_TERMINUS] = Field(VrsTypes.SEQ_TERMINUS, description=f'MUST be "{VrsTypes.SEQ_TERMINUS.value}"') location: Union[IRI, SequenceLocation] = Field(..., description="The location of the terminus.") class ga4gh(_Ga4ghIdentifiableObject.ga4gh): @@ -546,7 +563,7 @@ class DerivativeSequence(_VariationBase): sequence composed from multiple sequence adjacencies. """ - type: Literal["DerivativeSequence"] = Field("DerivativeSequence", description='MUST be "DerivativeSequence"') + type: Literal[VrsTypes.DERIVATIVE_SEQ] = Field(VrsTypes.DERIVATIVE_SEQ, description=f'MUST be "{VrsTypes.DERIVATIVE_SEQ.value}"') components: List[Union[IRI, Adjacency, Allele, SequenceTerminus, CisPhasedBlock]] = Field( ..., description="The sequence components that make up the derivative sequence.", @@ -580,7 +597,7 @@ class CopyNumberCount(_CopyNumber): (e.g. genome, cell, etc.). """ - type: Literal['CopyNumberCount'] = Field('CopyNumberCount', description='MUST be "CopyNumberCount"') + type: Literal[VrsTypes.CN_COUNT] = Field(VrsTypes.CN_COUNT, description=f'MUST be "{VrsTypes.CN_COUNT.value}"') copies: Union[Range, int] = Field( ..., description='The integral number of copies of the subject in a system' ) @@ -599,7 +616,7 @@ class CopyNumberChange(_CopyNumber): (e.g. genome, cell, etc.) relative to a baseline ploidy. """ - type: Literal['CopyNumberChange'] = Field('CopyNumberChange', description='MUST be "CopyNumberChange"') + type: Literal[VrsTypes.CN_CHANGE] = Field(VrsTypes.CN_CHANGE, description=f'MUST be "{VrsTypes.CN_CHANGE.value}"') copyChange: CopyChange = Field( ..., description='MUST be one of "efo:0030069" (complete genomic loss), "efo:0020073" (high-level loss), "efo:0030068" (low-level loss), "efo:0030067" (loss), "efo:0030064" (regional base ploidy), "efo:0030070" (gain), "efo:0030071" (low-level gain), "efo:0030072" (high-level gain).', From 5c1a9d456ec53558c4b5c8b35005aae3ab12dadf Mon Sep 17 00:00:00 2001 From: Kori Kuzma Date: Wed, 10 Jul 2024 10:44:55 -0400 Subject: [PATCH 4/4] refactor: remove _internal directories (#421) close #407 * Remove _internal directories and move files up a directory --- src/ga4gh/core/__init__.py | 36 ++++- src/ga4gh/core/_internal/__init__.py | 0 src/ga4gh/core/{_internal => }/digests.py | 0 src/ga4gh/core/domain_models.py | 142 ++++++++++++++++++ src/ga4gh/core/{_internal => }/enderef.py | 0 .../{_internal/models.py => entity_models.py} | 137 +---------------- src/ga4gh/core/{_internal => }/identifiers.py | 0 src/ga4gh/core/{_internal => }/pydantic.py | 0 src/ga4gh/vrs/__init__.py | 11 +- src/ga4gh/vrs/{_internal => }/enderef.py | 0 src/ga4gh/vrs/{_internal => }/models.py | 4 +- src/ga4gh/vrs/normalize.py | 2 +- 12 files changed, 187 insertions(+), 145 deletions(-) delete mode 100644 src/ga4gh/core/_internal/__init__.py rename src/ga4gh/core/{_internal => }/digests.py (100%) create mode 100644 src/ga4gh/core/domain_models.py rename src/ga4gh/core/{_internal => }/enderef.py (100%) rename src/ga4gh/core/{_internal/models.py => entity_models.py} (78%) rename src/ga4gh/core/{_internal => }/identifiers.py (100%) rename src/ga4gh/core/{_internal => }/pydantic.py (100%) rename src/ga4gh/vrs/{_internal => }/enderef.py (100%) rename src/ga4gh/vrs/{_internal => }/models.py (99%) diff --git a/src/ga4gh/core/__init__.py b/src/ga4gh/core/__init__.py index 77466e15..53c4edb9 100644 --- a/src/ga4gh/core/__init__.py +++ b/src/ga4gh/core/__init__.py @@ -2,20 +2,44 @@ """ -import warnings from importlib.metadata import version, PackageNotFoundError -from ._internal.digests import sha512t24u -from ._internal.enderef import ga4gh_enref, ga4gh_deref -from ._internal.identifiers import ( +from .digests import sha512t24u +from .enderef import ga4gh_enref, ga4gh_deref +from .identifiers import ( ga4gh_digest, ga4gh_identify, ga4gh_serialize, is_ga4gh_identifier, parse_ga4gh_identifier, VrsObjectIdentifierIs, use_ga4gh_compute_identifier_when, CURIE_NAMESPACE, CURIE_SEP, GA4GH_PREFIX_SEP, GA4GH_IR_REGEXP, GA4GH_DIGEST_REGEXP ) -from ._internal.pydantic import ( +from .pydantic import ( is_pydantic_instance, is_curie_type, is_ga4gh_identifiable, is_literal, pydantic_copy ) -from ._internal import models as common_models +from . import entity_models, domain_models + +__all__ = [ + "sha512t24u", + "ga4gh_enref", + "ga4gh_deref", + "ga4gh_digest", + "ga4gh_identify", + "ga4gh_serialize", + "is_ga4gh_identifier", + "parse_ga4gh_identifier", + "VrsObjectIdentifierIs", + "use_ga4gh_compute_identifier_when", + "CURIE_NAMESPACE", + "CURIE_SEP", + "GA4GH_PREFIX_SEP", + "GA4GH_IR_REGEXP", + "GA4GH_DIGEST_REGEXP", + "is_pydantic_instance", + "is_curie_type", + "is_ga4gh_identifiable", + "is_literal", + "pydantic_copy", + "entity_models", + "domain_models" +] try: __version__ = version(__name__) diff --git a/src/ga4gh/core/_internal/__init__.py b/src/ga4gh/core/_internal/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/src/ga4gh/core/_internal/digests.py b/src/ga4gh/core/digests.py similarity index 100% rename from src/ga4gh/core/_internal/digests.py rename to src/ga4gh/core/digests.py diff --git a/src/ga4gh/core/domain_models.py b/src/ga4gh/core/domain_models.py new file mode 100644 index 00000000..d6d0918f --- /dev/null +++ b/src/ga4gh/core/domain_models.py @@ -0,0 +1,142 @@ +"""GKS Common Library Domain Entity models + +**This module should not be imported directly.** + +Instead, users should use one of the following: + + * `from ga4gh.core import domain_models`, and refer to models with the + abbreviated name, e.g., `domain_models.Gene` (recommended) + + * `import ga4gh.core`, and refer to models using the fully-qualified + module name, e.g., `ga4gh.core.domain_models.Gene` +""" +from typing import Literal, Union, List +from enum import Enum + +from pydantic import Field, RootModel + +from ga4gh.core.entity_models import _DomainEntity + + +class CommonDomainType(str, Enum): + """Define GKS Common Domain Entity types""" + + PHENOTYPE = "Phenotype" + DISEASE = "Disease" + TRAIT_SET = "TraitSet" + TR_ACTION = "TherapeuticAction" + TR_AGENT = "TherapeuticAgent" + TR_SUB = "TherapeuticSubstituteGroup" + TR_COMB = "CombinationTherapy" + GENE = "Gene" + +class Phenotype(_DomainEntity): + """An observable characteristic or trait of an organism.""" + + type: Literal[CommonDomainType.PHENOTYPE] = Field( + CommonDomainType.PHENOTYPE, + description=f'MUST be "{CommonDomainType.PHENOTYPE.value}".' + ) + + +class Disease(_DomainEntity): + """A particular abnormal condition that negatively affects the structure or function + of all or part of an organism and is not immediately due to any external injury. + """ + + type: Literal[CommonDomainType.DISEASE] = Field( + CommonDomainType.DISEASE, + description=f'MUST be "{CommonDomainType.DISEASE.value}".' + ) + + +class TraitSet(_DomainEntity): + """A set of phenotype and/or disease concepts that together constitute a condition.""" + + type: Literal[CommonDomainType.TRAIT_SET] = Field( + CommonDomainType.TRAIT_SET, + description=f'MUST be "{CommonDomainType.TRAIT_SET.value}".' + ) + traits: List[Union[Disease, Phenotype]] = Field( + ..., + min_length=2 + ) + + +class Condition(RootModel): + """A disease or other medical disorder.""" + + root: Union[TraitSet, Disease, Phenotype] = Field( + ..., + json_schema_extra={'description': 'A disease or other medical disorder.'}, + discriminator='type', + ) + + +class TherapeuticAction(_DomainEntity): + """A therapeutic action taken that is intended to alter or stop a pathologic process.""" + + type: Literal[CommonDomainType.TR_ACTION] = Field( + CommonDomainType.TR_ACTION, + description=f'MUST be "{CommonDomainType.TR_ACTION.value}".' + ) + + +class TherapeuticAgent(_DomainEntity): + """An administered therapeutic agent that is intended to alter or stop a pathologic process.""" + + type: Literal[CommonDomainType.TR_AGENT] = Field( + CommonDomainType.TR_AGENT, + description=f'MUST be "{CommonDomainType.TR_AGENT.value}".' + ) + + +class TherapeuticSubstituteGroup(_DomainEntity): + """A group of therapeutic procedures that may be treated as substitutes for one another.""" + + type: Literal[CommonDomainType.TR_SUB] = Field( + CommonDomainType.TR_SUB, + description=f'MUST be "{CommonDomainType.TR_SUB.value}".' + ) + substitutes: List[Union[TherapeuticAction, TherapeuticAgent]] = Field( + ..., + description='The individual therapeutic procedures that may be treated as substitutes.', + min_length=2 + ) + + +class CombinationTherapy(_DomainEntity): + """A therapeutic procedure that involves multiple different therapeutic procedures + performed in combination. + """ + + type: Literal[CommonDomainType.TR_COMB] = Field( + CommonDomainType.TR_COMB, + description=f'MUST be "{CommonDomainType.TR_COMB.value}".' + ) + components: List[Union[TherapeuticSubstituteGroup, TherapeuticAction, TherapeuticAgent]] = Field( + ..., + description='The individual therapeutic procedure components that constitute the combination therapy.', + min_length=2 + ) + + +class TherapeuticProcedure(RootModel): + """An action or administration of therapeutic agents to produce an effect that is + intended to alter or stop a pathologic process. + """ + + root: Union[CombinationTherapy, TherapeuticSubstituteGroup, TherapeuticAction, TherapeuticAgent] = Field( + ..., + json_schema_extra={'description': 'An action or administration of therapeutic agents to produce an effect that is intended to alter or stop a pathologic process.'}, + discriminator='type', + ) + + +class Gene(_DomainEntity): + """A basic physical and functional unit of heredity.""" + + type: Literal[CommonDomainType.GENE] = Field( + CommonDomainType.GENE, + description=f'MUST be "{CommonDomainType.GENE.value}".' + ) diff --git a/src/ga4gh/core/_internal/enderef.py b/src/ga4gh/core/enderef.py similarity index 100% rename from src/ga4gh/core/_internal/enderef.py rename to src/ga4gh/core/enderef.py diff --git a/src/ga4gh/core/_internal/models.py b/src/ga4gh/core/entity_models.py similarity index 78% rename from src/ga4gh/core/_internal/models.py rename to src/ga4gh/core/entity_models.py index 4ab17edc..8921b284 100644 --- a/src/ga4gh/core/_internal/models.py +++ b/src/ga4gh/core/entity_models.py @@ -1,14 +1,14 @@ -"""GKS Common Library models +"""GKS Common Library Entity models **This module should not be imported directly.** Instead, users should use one of the following: - * `from ga4gh.core import common_models`, and refer to models with the - abbreviated name, e.g., `common_models.Gene` (recommended) + * `from ga4gh.core import entity_models`, and refer to models with the + abbreviated name, e.g., `entity_models.Method` (recommended) * `import ga4gh.core`, and refer to models using the fully-qualified - module name, e.g., `ga4gh.core.common_models.Gene` + module name, e.g., `ga4gh.core.entity_models.Method` """ from __future__ import annotations import datetime @@ -326,132 +326,3 @@ class Method(_InformationEntity): description="A more specific type of entity the method represents (e.g. Variant Interpretation Guideline, Experimental Protocol)", ) license: Optional[str] = Field(None, description="A particular license that dictates legal permissions for how a published method (e.g. an experimental protocol, workflow specification, curation guideline) can be used.") - - -######################################### -# GKS Common Domain Entities -######################################### - - -class CommonDomainType(str, Enum): - """Define GKS Common Domain Entity types""" - - PHENOTYPE = "Phenotype" - DISEASE = "Disease" - TRAIT_SET = "TraitSet" - TR_ACTION = "TherapeuticAction" - TR_AGENT = "TherapeuticAgent" - TR_SUB = "TherapeuticSubstituteGroup" - TR_COMB = "CombinationTherapy" - GENE = "Gene" - -class Phenotype(_DomainEntity): - """An observable characteristic or trait of an organism.""" - - type: Literal[CommonDomainType.PHENOTYPE] = Field( - CommonDomainType.PHENOTYPE, - description=f'MUST be "{CommonDomainType.PHENOTYPE.value}".' - ) - - -class Disease(_DomainEntity): - """A particular abnormal condition that negatively affects the structure or function - of all or part of an organism and is not immediately due to any external injury. - """ - - type: Literal[CommonDomainType.DISEASE] = Field( - CommonDomainType.DISEASE, - description=f'MUST be "{CommonDomainType.DISEASE.value}".' - ) - - -class TraitSet(_DomainEntity): - """A set of phenotype and/or disease concepts that together constitute a condition.""" - - type: Literal[CommonDomainType.TRAIT_SET] = Field( - CommonDomainType.TRAIT_SET, - description=f'MUST be "{CommonDomainType.TRAIT_SET.value}".' - ) - traits: List[Union[Disease, Phenotype]] = Field( - ..., - min_length=2 - ) - - -class Condition(RootModel): - """A disease or other medical disorder.""" - - root: Union[TraitSet, Disease, Phenotype] = Field( - ..., - json_schema_extra={'description': 'A disease or other medical disorder.'}, - discriminator='type', - ) - - -class TherapeuticAction(_DomainEntity): - """A therapeutic action taken that is intended to alter or stop a pathologic process.""" - - type: Literal[CommonDomainType.TR_ACTION] = Field( - CommonDomainType.TR_ACTION, - description=f'MUST be "{CommonDomainType.TR_ACTION.value}".' - ) - - -class TherapeuticAgent(_DomainEntity): - """An administered therapeutic agent that is intended to alter or stop a pathologic process.""" - - type: Literal[CommonDomainType.TR_AGENT] = Field( - CommonDomainType.TR_AGENT, - description=f'MUST be "{CommonDomainType.TR_AGENT.value}".' - ) - - -class TherapeuticSubstituteGroup(_DomainEntity): - """A group of therapeutic procedures that may be treated as substitutes for one another.""" - - type: Literal[CommonDomainType.TR_SUB] = Field( - CommonDomainType.TR_SUB, - description=f'MUST be "{CommonDomainType.TR_SUB.value}".' - ) - substitutes: List[Union[TherapeuticAction, TherapeuticAgent]] = Field( - ..., - description='The individual therapeutic procedures that may be treated as substitutes.', - min_length=2 - ) - - -class CombinationTherapy(_DomainEntity): - """A therapeutic procedure that involves multiple different therapeutic procedures - performed in combination. - """ - - type: Literal[CommonDomainType.TR_COMB] = Field( - CommonDomainType.TR_COMB, - description=f'MUST be "{CommonDomainType.TR_COMB.value}".' - ) - components: List[Union[TherapeuticSubstituteGroup, TherapeuticAction, TherapeuticAgent]] = Field( - ..., - description='The individual therapeutic procedure components that constitute the combination therapy.', - min_length=2 - ) - - -class TherapeuticProcedure(RootModel): - """An action or administration of therapeutic agents to produce an effect that is - intended to alter or stop a pathologic process. - """ - - root: Union[CombinationTherapy, TherapeuticSubstituteGroup, TherapeuticAction, TherapeuticAgent] = Field( - ..., - json_schema_extra={'description': 'An action or administration of therapeutic agents to produce an effect that is intended to alter or stop a pathologic process.'}, - discriminator='type', - ) - - -class Gene(_DomainEntity): - """A basic physical and functional unit of heredity.""" - - type: Literal[CommonDomainType.GENE] = Field( - CommonDomainType.GENE, - description=f'MUST be "{CommonDomainType.GENE.value}".' - ) diff --git a/src/ga4gh/core/_internal/identifiers.py b/src/ga4gh/core/identifiers.py similarity index 100% rename from src/ga4gh/core/_internal/identifiers.py rename to src/ga4gh/core/identifiers.py diff --git a/src/ga4gh/core/_internal/pydantic.py b/src/ga4gh/core/pydantic.py similarity index 100% rename from src/ga4gh/core/_internal/pydantic.py rename to src/ga4gh/core/pydantic.py diff --git a/src/ga4gh/vrs/__init__.py b/src/ga4gh/vrs/__init__.py index fdd445d7..adbec052 100644 --- a/src/ga4gh/vrs/__init__.py +++ b/src/ga4gh/vrs/__init__.py @@ -5,10 +5,15 @@ from importlib.metadata import version, PackageNotFoundError from .normalize import normalize -from ._internal.enderef import vrs_deref, vrs_enref -from ._internal import models +from .enderef import vrs_deref, vrs_enref +from . import models -__all__ = """models normalize schema_path vrs_deref vrs_enref""".split() +__all__ = [ + "normalize", + "vrs_deref", + "vrs_enref", + "models" +] try: __version__ = version(__name__) diff --git a/src/ga4gh/vrs/_internal/enderef.py b/src/ga4gh/vrs/enderef.py similarity index 100% rename from src/ga4gh/vrs/_internal/enderef.py rename to src/ga4gh/vrs/enderef.py diff --git a/src/ga4gh/vrs/_internal/models.py b/src/ga4gh/vrs/models.py similarity index 99% rename from src/ga4gh/vrs/_internal/models.py rename to src/ga4gh/vrs/models.py index 44ca885b..721d8da2 100644 --- a/src/ga4gh/vrs/_internal/models.py +++ b/src/ga4gh/vrs/models.py @@ -19,11 +19,11 @@ from pydantic import BaseModel, Field, RootModel, StringConstraints, model_serializer -from ga4gh.core._internal.pydantic import ( +from ga4gh.core.pydantic import ( is_ga4gh_identifiable, getattr_in ) -from ga4gh.core._internal.models import IRI, Expression, _DomainEntity +from ga4gh.core.entity_models import IRI, Expression, _DomainEntity def flatten(vals): diff --git a/src/ga4gh/vrs/normalize.py b/src/ga4gh/vrs/normalize.py index 33dc70a5..41389937 100644 --- a/src/ga4gh/vrs/normalize.py +++ b/src/ga4gh/vrs/normalize.py @@ -11,7 +11,7 @@ from bioutils.normalize import normalize as _normalize, NormalizationMode from ga4gh.core import is_pydantic_instance, ga4gh_digest, pydantic_copy -from ._internal import models +from . import models from .dataproxy import SequenceProxy