Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: update gks-common/vrs model #391

Merged
merged 2 commits into from
Apr 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 37 additions & 11 deletions src/ga4gh/core/_internal/models.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""
"""GKS Common Library models

**This module should not be imported directly.**

Instead, users should use one of the following:
Expand All @@ -16,6 +17,11 @@
from ga4gh.core import GA4GH_IR_REGEXP


#########################################
# gks-common core
#########################################


class Relation(Enum):
"""A mapping relation between concepts as defined by the Simple Knowledge
Organization System (SKOS).
Expand Down Expand Up @@ -44,6 +50,12 @@ class Code(RootModel):


class IRI(RootModel):
"""An IRI Reference (either an IRI or a relative-reference), according to `RFC3986
section 4.1 <https://datatracker.ietf.org/doc/html/rfc3986#section-4.1>` and
`RFC3987 section 2.1 <https://datatracker.ietf.org/doc/html/rfc3987#section-2.1>`.
MAY be a JSON Pointer as an IRI fragment, as described by `RFC6901 section 6
<https://datatracker.ietf.org/doc/html/rfc6901#section-6>`.
"""

def __hash__(self):
return self.root.__hash__()
Expand Down Expand Up @@ -152,6 +164,10 @@ class _DomainEntity(_MappableEntity):
)


#########################################
# gks-common conditions
#########################################

class Phenotype(_DomainEntity):
"""An observable characteristic or trait of an organism."""

Expand Down Expand Up @@ -185,6 +201,21 @@ class TraitSet(_DomainEntity):
)


class Condition(RootModel):
    """A disease or other medical disorder."""

    # Root-model wrapper: the wrapped value is exactly one of Disease,
    # Phenotype, or TraitSet. The `type` field is declared as the
    # discriminator, so the concrete class is selected from that field.
    root: Union[Disease, Phenotype, TraitSet] = Field(
        ...,
        json_schema_extra={'description': 'A disease or other medical disorder.'},
        discriminator='type',
    )


#########################################
# gks-common therapeutics
#########################################


class TherapeuticAction(_DomainEntity):
"""A therapeutic action taken that is intended to alter or stop a pathologic process."""

Expand Down Expand Up @@ -237,16 +268,6 @@ class CombinationTherapy(_DomainEntity):
)


class Condition(RootModel):
"""A disease or other medical disorder."""

root: Union[Disease, Phenotype, TraitSet] = Field(
...,
json_schema_extra={'description': 'A disease or other medical disorder.'},
discriminator='type',
)


class TherapeuticProcedure(RootModel):
"""An action or administration of therapeutic agents to produce an effect that is
intended to alter or stop a pathologic process.
Expand All @@ -259,6 +280,11 @@ class TherapeuticProcedure(RootModel):
)


#########################################
# gks-common genes
#########################################


class Gene(_DomainEntity):
"""A basic physical and functional unit of heredity."""

Expand Down
159 changes: 127 additions & 32 deletions src/ga4gh/vrs/_internal/models.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""
"""GA4GH VRS models

**This module should not be imported directly.**

Instead, users should use one of the following:
Expand Down Expand Up @@ -148,11 +149,15 @@ class Syntax(Enum):


class ResidueAlphabet(Enum):
    """Define constraints for residue alphabet"""

    # Allowed values for `SequenceReference.residueAlphabet`.
    # NOTE(review): presumably 'aa' = amino acid and 'na' = nucleic acid
    # (inferred from the member names) — confirm against the VRS spec.
    AA = 'aa'
    NA = 'na'


class CopyChange(Enum):
"""Define constraints for copy change"""

EFO_0030069 = 'efo:0030069'
EFO_0020073 = 'efo:0020073'
EFO_0030068 = 'efo:0030068'
Expand Down Expand Up @@ -295,8 +300,13 @@ class Expression(BaseModel):
value: str
syntax_version: Optional[str] = None

#########################################
# vrs numerics, comparators, and ranges
#########################################

class Range(RootModel):
"""An inclusive range of values bounded by one or more integers."""

root: List[Optional[int]] = Field(
...,
json_schema_extra={
Expand All @@ -308,6 +318,12 @@ class Range(RootModel):


class Residue(RootModel):
"""A character representing a specific residue (i.e., molecular species) or
groupings of these ("ambiguity codes"), using `one-letter IUPAC abbreviations
<https://en.wikipedia.org/wiki/International_Union_of_Pure_and_Applied_Chemistry#Amino_acid_and_nucleotide_base_codes>`_
for nucleic acids and amino acids.
"""

root: constr(pattern=r'[A-Z*\-]') = Field(
...,
json_schema_extra={
Expand All @@ -317,6 +333,12 @@ class Residue(RootModel):


class SequenceString(RootModel):
"""A character string of Residues that represents a biological sequence using the
conventional sequence order (5'-to-3' for nucleic acid sequences, and
amino-to-carboxyl for amino acid sequences). IUPAC ambiguity codes are permitted in
Sequence Strings.
"""

root: constr(pattern=r'^[A-Z*\-]*$') = Field(
...,
json_schema_extra={
Expand All @@ -325,33 +347,19 @@ class SequenceString(RootModel):
)


class SequenceReference(_ValueObject):
model_config = ConfigDict(
use_enum_values=True
)

type: Literal['SequenceReference'] = Field('SequenceReference', description='MUST be "SequenceReference"')
refgetAccession: constr(pattern=r'^SQ.[0-9A-Za-z_\-]{32}$') = Field(
...,
description='A `GA4GH RefGet <http://samtools.github.io/hts-specs/refget.html>` identifier for the referenced sequence, using the sha512t24u digest.',
)
residueAlphabet: Optional[ResidueAlphabet] = None

class ga4gh(_ValueObject.ga4gh):
keys = [
'refgetAccession',
'type'
]
#########################################
# vrs sequence expression
#########################################


class LengthExpression(_ValueObject):
"""An expression of a DNA, RNA, or protein polymer of known length but unspecified sequence."""
"""A sequence expressed only by its length."""

type: Literal['ReferenceLengthExpression'] = Field(
'ReferenceLengthExpression', description='MUST be "ReferenceLengthExpression"'
type: Literal['LengthExpression'] = Field(
'LengthExpression', description='MUST be "LengthExpression"'
)
length: Union[Range, int] = Field(
..., description='The number of residues of the expressed sequence.'
length: Optional[Union[Range, int]] = Field(
None
)

class ga4gh(_ValueObject.ga4gh):
Expand All @@ -362,7 +370,7 @@ class ga4gh(_ValueObject.ga4gh):


class ReferenceLengthExpression(_ValueObject):
"""An expression sequence derived from a reference."""
"""An expression of a length of a sequence from a repeating reference."""

type: Literal['ReferenceLengthExpression'] = Field(
'ReferenceLengthExpression', description='MUST be "ReferenceLengthExpression"'
Expand Down Expand Up @@ -400,21 +408,47 @@ class ga4gh(_ValueObject.ga4gh):
]


#########################################
# vrs location
#########################################


class SequenceReference(_ValueObject):
    """A sequence of nucleic or amino acid character codes."""

    # Populate enum-typed fields (here `residueAlphabet`) with the enum's
    # value rather than the enum member itself.
    model_config = ConfigDict(
        use_enum_values=True
    )

    type: Literal['SequenceReference'] = Field('SequenceReference', description='MUST be "SequenceReference"')
    # NOTE(review): the '.' after 'SQ' in the pattern is an unescaped regex
    # wildcard, so any single character is accepted in that position, not only
    # a literal dot. Confirm whether the upstream schema intends a literal
    # 'SQ.' prefix before tightening this to r'^SQ\.…'.
    refgetAccession: constr(pattern=r'^SQ.[0-9A-Za-z_\-]{32}$') = Field(
        ...,
        description='A `GA4GH RefGet <http://samtools.github.io/hts-specs/refget.html>` identifier for the referenced sequence, using the sha512t24u digest.',
    )
    # Optional; defaults to None when the alphabet is not stated.
    residueAlphabet: Optional[ResidueAlphabet] = None

    class ga4gh(_ValueObject.ga4gh):
        # NOTE(review): `residueAlphabet` is not listed here; presumably `keys`
        # selects the fields used for GA4GH digest computation — confirm
        # against `_ValueObject.ga4gh`.
        keys = [
            'refgetAccession',
            'type'
        ]


class SequenceLocation(_Ga4ghIdentifiableObject):
"""A `Location` defined by an interval on a referenced `Sequence`."""

type: Literal['SequenceLocation'] = Field('SequenceLocation', description='MUST be "SequenceLocation"')
sequenceReference: Optional[Union[IRI, SequenceReference]] = Field(
None, description='A SequenceReference.'
)
start: Union[Range, int] = Field(
...,
start: Optional[Union[Range, int]] = Field(
None,
description='The start coordinate or range of the SequenceLocation. The minimum value of this coordinate or range is 0. MUST represent a coordinate or range less than the value of `end`.',
)
end: Union[Range, int] = Field(
...,
end: Optional[Union[Range, int]] = Field(
None,
description='The end coordinate or range of the SequenceLocation. The minimum value of this coordinate or range is 0. MUST represent a coordinate or range greater than the value of `start`.',

)
def get_refget_accession(self):
if isinstance(self.sequenceReference, SequenceReference):
Expand All @@ -433,6 +467,9 @@ class ga4gh(_Ga4ghIdentifiableObject.ga4gh):
'type'
]

#########################################
# base variation
#########################################


class _VariationBase(_Ga4ghIdentifiableObject):
Expand All @@ -441,7 +478,49 @@ class _VariationBase(_Ga4ghIdentifiableObject):
expressions: Optional[List[Expression]] = None


#########################################
# vrs structural variation (under active discussion)
#########################################


class Adjacency(_VariationBase):
    """The `Adjacency` class can represent either the termination of a sequence or the
    adjoining of the end of a sequence with the beginning of an adjacent sequence,
    potentially with an intervening linker sequence.
    """

    # Populate enum-typed fields with the enum's value rather than the member.
    model_config = ConfigDict(
        use_enum_values=True
    )

    type: Literal['Adjacency'] = Field('Adjacency', description='MUST be "Adjacency"')
    # One entry describes a terminated sequence; two entries describe an
    # adjoined pair. The 1..2 bound is enforced by min_length/max_length.
    adjoinedSequences: List[Union[IRI, SequenceLocation]] = Field(
        ...,
        description="The terminal sequence or pair of adjoined sequences that defines the adjacency.",
        min_length=1,
        max_length=2,
    )
    # Optional; None when no linker sequence is given.
    linker: Optional[Union[LiteralSequenceExpression, ReferenceLengthExpression]] = Field(
        None,
        description="The sequence found between adjoined sequences."
    )

    class ga4gh(_Ga4ghIdentifiableObject.ga4gh):
        # Identifier prefix and the field names listed for this class.
        prefix = 'AJ'
        keys = [
            'adjoinedSequences',
            'linker',
            'type'
        ]


#########################################
# vrs molecular variation
#########################################


class Allele(_VariationBase):
"""The state of a molecule at a Location."""

type: Literal['Allele'] = Field('Allele', description='MUST be "Allele"')
location: Union[IRI, SequenceLocation] = Field(
Expand All @@ -461,12 +540,12 @@ class ga4gh(_Ga4ghIdentifiableObject.ga4gh):


class Haplotype(_VariationBase):
"""A set of non-overlapping Allele members that co-occur on the same molecule."""
"""An ordered set of co-occurring Variation on the same molecule."""

type: Literal['Haplotype'] = Field('Haplotype', description='MUST be "Haplotype"')
members: List[Union[Allele, IRI]] = Field(
members: List[Union[Adjacency, Allele, IRI]] = Field(
...,
description='A list of Alleles (or IRI references to `Alleles`) that comprise a Haplotype. Since each `Haplotype` member MUST be an `Allele`, and all members MUST share a common `SequenceReference`, implementations MAY use a compact representation of Haplotype that omits type and `SequenceReference` information in individual Haplotype members. Implementations MUST transform compact `Allele` representations into an `Allele` when computing GA4GH identifiers.',
description='A list of Alleles and Adjacencies that comprise a Haplotype. Members must share the same reference sequence as adjacent members. Alleles should not have overlapping or adjacent coordinates with neighboring Alleles. Neighboring alleles should be ordered by ascending coordinates, unless represented on a DNA inversion (following an Adjacency with end-defined adjoinedSequences), in which case they should be ordered in descending coordinates. Sequence references MUST be consistent for all members between and including the end of one Adjacency and the beginning of another.',
min_length=2,
)

Expand All @@ -484,6 +563,11 @@ class ga4gh(_Ga4ghIdentifiableObject.ga4gh):
]


#########################################
# vrs systemic variation
#########################################


class _CopyNumber(_VariationBase):
"""A measure of the copies of a `Location` within a system (e.g. genome, cell, etc.)"""

Expand Down Expand Up @@ -535,6 +619,11 @@ class ga4gh(_Ga4ghIdentifiableObject.ga4gh):
]


#########################################
# vrs kinds of variation, expression, and location
#########################################


class MolecularVariation(RootModel):
"""A variation on a contiguous molecule."""

Expand All @@ -547,6 +636,8 @@ class MolecularVariation(RootModel):
)

class SequenceExpression(RootModel):
"""An expression describing a Sequence."""

root: Union[LiteralSequenceExpression, ReferenceLengthExpression] = Field(
...,
json_schema_extra={'description': 'An expression describing a Sequence.'},
Expand All @@ -555,6 +646,8 @@ class SequenceExpression(RootModel):


class Location(RootModel):
"""A contiguous segment of a biological sequence."""

root: SequenceLocation = Field(
...,
json_schema_extra={
Expand All @@ -565,6 +658,8 @@ class Location(RootModel):


class Variation(RootModel):
"""A representation of the state of one or more biomolecules."""

root: Union[Allele, CopyNumberChange, CopyNumberCount, Haplotype] = Field(
...,
json_schema_extra={
Expand Down
1 change: 1 addition & 0 deletions tests/test_vrs2.py
Original file line number Diff line number Diff line change
Expand Up @@ -304,6 +304,7 @@ def test_class_refatt_map():
'_CopyNumber': ['location'],
'CopyNumberCount': ['location'],
'CopyNumberChange': ['location'],
'Adjacency': ['adjoinedSequences'],
}
assert class_refatt_map_expected == models.class_refatt_map

Expand Down
Loading