Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: refactor hgvsTools and support c. translation #366

Merged
merged 25 commits into from
Mar 28, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
cd963d2
Add support for mito `m.` in translate_from
larrybabb Mar 12, 2024
c7138a6
fixed accession seq type mapping NR_ to `n.`
larrybabb Mar 12, 2024
21632fe
remove cassette files that need to be rebuilt due to test updates
larrybabb Mar 12, 2024
4c5f81a
update cassettes + change chr input M to MT
korikuzma Mar 13, 2024
92bd821
Merge branch 'main' into issue-360
korikuzma Mar 14, 2024
4dde86c
use hgvs lib from 225-uncertain-ranges branch
toneillbroad Mar 14, 2024
5928e20
added support for X{MRP} accessions + fix for NM_ sequence coding type
larrybabb Mar 15, 2024
f03963e
fixed test for `coding` sequences
larrybabb Mar 15, 2024
231719a
fixed cassettes for test fix (ugh)
larrybabb Mar 15, 2024
d7c3f1c
and more
larrybabb Mar 15, 2024
ce03f28
reverted [NX]M_ mapping to 'n.' to separate 2nd issue from this PR
larrybabb Mar 18, 2024
db39330
refactor hgvs logic in Translators
larrybabb Mar 20, 2024
dce7d24
added logic to handle from_hgvs for `c.` expressions
larrybabb Mar 21, 2024
653002e
from/to hgvs for alleleTranslator working again
larrybabb Mar 25, 2024
36779ec
moved normalizer rest dp to notebooks. reran all tests/cassettes
larrybabb Mar 26, 2024
152a4b9
fixed incorrect UTA designation
larrybabb Mar 26, 2024
a34086e
remove all cassettes for rebuild - to avoid conflict resolution step
larrybabb Mar 26, 2024
e1c31c8
Merge branch 'main' into issue-365
larrybabb Mar 26, 2024
bfa9e47
fixed dp to rest_dp for tests
larrybabb Mar 26, 2024
b3155a3
added 2 missing cassettes
larrybabb Mar 27, 2024
39db9ad
created cassette for troubled test case
larrybabb Mar 27, 2024
e5d3435
reverted hgvs package to release 1.4
larrybabb Mar 27, 2024
f7a350e
Merge branch 'main' into issue-365
larrybabb Mar 27, 2024
035bdf1
PR review suggestions
larrybabb Mar 28, 2024
6eeec08
fix translator init
larrybabb Mar 28, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ extras =
psycopg2-binary
biocommons.seqrepo>=0.5.1
bioutils>=0.5.2
hgvs@git+https://github.com/biocommons/hgvs@225-uncertain-ranges
hgvs>=1.4
requests
dill~=0.3.7
click
Expand Down
58 changes: 9 additions & 49 deletions src/ga4gh/vrs/_internal/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -414,7 +414,15 @@ class SequenceLocation(_Ga4ghIdentifiableObject):
end: Union[Range, int] = Field(
...,
description='The end coordinate or range of the SequenceLocation. The minimum value of this coordinate or range is 0. MUST represent a coordinate or range greater than the value of `start`.',

)
def get_refget_accession(self):
if isinstance(self.sequenceReference, SequenceReference):
return self.sequenceReference.refgetAccession
elif isinstance(self.sequenceReference, IRI):
return self.sequenceReference.root
else:
return None

class ga4gh(_Ga4ghIdentifiableObject.ga4gh):
prefix = 'SL'
Expand All @@ -426,6 +434,7 @@ class ga4gh(_Ga4ghIdentifiableObject.ga4gh):
]



class _VariationBase(_Ga4ghIdentifiableObject):
"""Base class for variation"""

Expand Down Expand Up @@ -526,27 +535,6 @@ class ga4gh(_Ga4ghIdentifiableObject.ga4gh):
]


# class GenotypeMember(_ValueObject):
# """A class for expressing the count of a specific `MolecularVariation` present
# in-trans at a genomic locus represented by a `Genotype`.
# """
#
# type: Literal['GenotypeMember'] = Field('GenotypeMember', description='MUST be "GenotypeMember".')
# count: Union[Range, int] = Field(
# ..., description='The number of copies of the `variation` at a Genotype locus.'
# )
# variation: Union[Allele, Haplotype] = Field(
# ..., description='A MolecularVariation at a Genotype locus.'
# )
#
# class ga4gh(_Ga4ghIdentifiableObject.ga4gh):
# keys = [
# 'type',
# 'count',
# 'variation'
# ]


class MolecularVariation(RootModel):
"""A variation on a contiguous molecule."""

Expand All @@ -558,34 +546,6 @@ class MolecularVariation(RootModel):
discriminator='type'
)


# class Genotype(_VariationBase):
# """A quantified set of _in-trans_ `MolecularVariation` at a genomic locus."""
#
# type: Literal['Genotype'] = Field(
# 'Genotype',
# description='MUST be "Genotype"'
# )
# # TODO members temporarily typed as List instead of Set + validate unique items
# members: List[GenotypeMember] = Field(
# ...,
# description='Each GenotypeMember in `members` describes a MolecularVariation and the count of that variation at the locus.',
# min_length=1,
# )
# count: Union[Range, int] = Field(
# ...,
# description='The total number of copies of all MolecularVariation at this locus, MUST be greater than or equal to the sum of GenotypeMember copy counts. If greater than the total counts, this implies additional MolecularVariation that are expected to exist but are not explicitly indicated.',
# )
#
# class ga4gh(_Ga4ghIdentifiableObject.ga4gh):
# prefix = 'GT'
# keys = [
# 'count',
# 'members',
# 'type'
# ]


class SequenceExpression(RootModel):
root: Union[LiteralSequenceExpression, ReferenceLengthExpression] = Field(
...,
Expand Down
84 changes: 81 additions & 3 deletions src/ga4gh/vrs/dataproxy.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
"""

from abc import ABC, abstractmethod
from typing import Tuple
from collections.abc import Sequence
import datetime
import functools
Expand Down Expand Up @@ -65,6 +66,36 @@ def get_metadata(self, identifier):

"""

@staticmethod
def extract_sequence_type(alias: str) -> str:
"""
The purpose of this function is to provide a convenient way to extract the sequence type from an accession by matching its prefix to a known set of prefixes.

Args:
alias (str): The accession string.

Returns:
str or None: The sequence type associated with the accession string, or None if no matching prefix is found.
"""

prefix_dict = {
"refseq:NM_": "c",
"refseq:NC_012920": "m",
"refseq:NG_": "g",
"refseq:NC_00": "g",
"refseq:NW_": "g",
"refseq:NR_": "n",
"refseq:NP_": "p",
"refseq:XM_": "c",
"refseq:XR_": "n",
"refseq:XP_": "p",
"GRCh": "g",
}

for prefix, seq_type in prefix_dict.items():
if alias.startswith(prefix):
return seq_type
return None

@functools.lru_cache()
def translate_sequence_identifier(self, identifier, namespace=None):
Expand All @@ -88,6 +119,56 @@ def translate_sequence_identifier(self, identifier, namespace=None):
nsd = namespace + ":"
aliases = [a for a in aliases if a.startswith(nsd)]
return aliases

def derive_refget_accession(self, ac: str):
"""Derive the refget accession from a public accession identifier

:param ac: public accession in simple or curie form from which to derive the refget accession
:return: Refget Accession if found
"""

if ac is None:
return None

if ":" not in ac[1:]:
# always coerce the namespace if none provided
ac = coerce_namespace(ac)

refget_accession = None
try:
aliases = self.translate_sequence_identifier(ac, namespace="ga4gh")
except KeyError:
_logger.error("KeyError when getting refget accession: %s", ac)
else:
if aliases:
refget_accession = aliases[0].split("ga4gh:")[-1]

return refget_accession

def is_valid_ref_seq(
self, sequence_id: str, start_pos: int, end_pos: int, ref: str
) -> Tuple[bool, str]:
"""Return wether or not the expected reference sequence matches the actual
reference sequence

:param sequence_id: Sequence ID to use
:param start_pos: Start pos (inter-residue) on the sequence_id
:param end_pos: End pos (inter-residue) on the sequence_id
:param ref: The expected reference sequence on the sequence_id given the
start_pos and end_pos
:return: Tuple containing whether or not actual reference sequence matches
the expected reference sequence and error message if mismatch
"""
actual_ref = self.get_sequence(sequence_id, start_pos, end_pos)
is_valid = actual_ref == ref
err_msg = ""
if not is_valid:
err_msg = (
f"Expected reference sequence {ref} on {sequence_id} at positions "
f"({start_pos}, {end_pos}) but found {actual_ref}"
)
_logger.warning(err_msg)
return is_valid, err_msg
larrybabb marked this conversation as resolved.
Show resolved Hide resolved


class _SeqRepoDataProxyBase(_DataProxy):
Expand Down Expand Up @@ -224,16 +305,13 @@ def _isoformat(o):
# eg: '2015-09-25T23:14:42.588601Z'
return o.isoformat("T") + "Z"


# Future implementations
# * The RefGetDataProxy is waiting on support for sequence lookup by alias
# class RefGetDataProxy(_DataProxy):
# def __init__(self, base_url):
# super().__init__()
# self.base_url = base_url



def create_dataproxy(uri: str = None) -> _DataProxy:
"""Create a dataproxy from uri or GA4GH_VRS_DATAPROXY_URI

Expand Down
Loading
Loading