Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

cleanup: resolve pylint warnings #149

Merged
merged 13 commits into from
Mar 8, 2023
10 changes: 8 additions & 2 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,10 +1,16 @@
# See https://pre-commit.com for more information
# See https://pre-commit.com/hooks.html for more hooks
repos:
- repo: https://github.com/pycqa/pylint
rev: v2.15.9
- repo: local
hooks:
- id: pylint
name: pylint
entry: pylint
language: system
types: [python]
require_serial: true
exclude: ^tests/
args:
- src/ga4gh/core
- src/ga4gh/vrs
- --fail-under=10
12 changes: 12 additions & 0 deletions .pylintrc
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,18 @@ max-line-length=120
# Maximum number of lines in a module.
max-module-lines=1000

[MESSAGES CONTROL]

# Disable the message, report, category or checker with the given id(s). You
# can either give multiple identifiers separated by comma (,) or put this
# option multiple times (only on the command line, not in the configuration
# file where it should appear only once). You can also use "--disable=all" to
# disable everything first and then re-enable specific checks. For example, if
# you want to run only the similarities checker, you can use "--disable=all
# --enable=similarities". If you want to run only the classes checker, but have
# no Warning level messages displayed, use "--disable=all --enable=classes
# --disable=W".
disable=fixme,protected-access

[STRING]

Expand Down
2 changes: 1 addition & 1 deletion src/ga4gh/core/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
parse_ga4gh_identifier)
from ._internal.jsonschema import (build_models, build_class_referable_attribute_map, is_pjs_class, is_pjs_instance,
is_curie, is_identifiable, is_literal, pjs_copy)

# pylint: disable=duplicate-code
try:
__version__ = get_distribution(__name__).version
except DistributionNotFound: # pragma: nocover
Expand Down
7 changes: 4 additions & 3 deletions src/ga4gh/core/_internal/identifiers.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,8 @@

# Assume that ga4gh.yaml and vrs.yaml files are in the same directory for now
schema_dir = os.environ.get("VRS_SCHEMA_DIR", pkg_resources.resource_filename(__name__, "data/schema"))
cfg = yaml.safe_load(open(schema_dir + "/ga4gh.yaml"))
with open(schema_dir + "/ga4gh.yaml", "r", encoding="utf-8") as f:
cfg = yaml.safe_load(f)
type_prefix_map_default = cfg["identifiers"]["type_prefix_map"]
namespace = cfg["identifiers"]["namespace"]
curie_sep = cfg["identifiers"]["curie_sep"]
Expand Down Expand Up @@ -144,7 +145,7 @@ def ga4gh_serialize(vro):

"""

def dictify(vro, enref=True):
def dictify(vro, enref=True): # pylint: disable=too-many-return-statements,too-many-branches
"""recursively converts (any) object to dictionary prior to
serialization

Expand Down Expand Up @@ -184,7 +185,7 @@ def dictify(vro, enref=True):
if is_curie_type(vro[0]):
return sorted(dictify(o) for o in vro.data)

arr = list()
arr = []
for o in vro.typed_elems:
d = dictify(o)
if isinstance(d, dict):
Expand Down
2 changes: 1 addition & 1 deletion src/ga4gh/core/_internal/jsonschema.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@

def build_models(path, standardize_names=False):
"""load models from json schema at path"""
with open(path, "r") as yaml_file:
with open(path, "r", encoding="utf-8") as yaml_file:
y = yaml.load(yaml_file, yaml.SafeLoader)
builder = pjs.ObjectBuilder(pjs_filter(y))
models = builder.build_classes(standardize_names=standardize_names)
Expand Down
2 changes: 1 addition & 1 deletion src/ga4gh/vrs/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,4 +23,4 @@

logging.basicConfig(
korikuzma marked this conversation as resolved.
Show resolved Hide resolved
filename="vrs-python.log",
format="[%(asctime)s] - %(name)s - %(levelname)s : %(message)s")
format="[%(asctime)s] - %(name)s - %(levelname)s : %(message)s")
4 changes: 2 additions & 2 deletions src/ga4gh/vrs/dataproxy.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@ def _get_sequence(self, identifier, start=None, end=None):
url = self.base_url + f"sequence/{identifier}"
_logger.info("Fetching %s", url)
params = {"start": start, "end": end}
resp = requests.get(url, params=params)
resp = requests.get(url, params=params, timeout=5)
if resp.status_code == 404:
raise KeyError(identifier)
resp.raise_for_status()
Expand All @@ -161,7 +161,7 @@ def _get_sequence(self, identifier, start=None, end=None):
def _get_metadata(self, identifier):
url = self.base_url + f"metadata/{identifier}"
_logger.info("Fetching %s", url)
resp = requests.get(url)
resp = requests.get(url, timeout=5)
if resp.status_code == 404:
raise KeyError(identifier)
resp.raise_for_status()
Expand Down
47 changes: 29 additions & 18 deletions src/ga4gh/vrs/extras/localizer.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,6 @@
"""convert named locations into SequenceLocations ("localize") by
reference to external data

"""convert named locations into SequenceLocations ("localize") by reference to external
data
"""

import copy

from bioutils.accessions import coerce_namespace
from bioutils.assemblies import make_name_ac_map
from bioutils.cytobands import get_cytoband_maps
Expand Down Expand Up @@ -45,27 +41,25 @@ def __init__(self):
self._ana_maps = {k: make_name_ac_map(k) for k in assy_name_to_map_name}


def localize_allele(self, allele):
def localize_allele(self, allele, assembly_name = "GRCh38"):
"""Converts VRS Allele's named features to sequence location"""
# copy input variant and replace location
# N.B. deepcopy leads to recursion errors
allele_sl = ga4gh.vrs.models.Variation(**allele.as_dict())
del allele_sl._id
allele_sl.location = self.localize(allele.location)
allele_sl.location = self.localize_named_feature(allele.location, assembly_name)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This doesn't seem right to me by the names and descriptions of these methods. Why would updating a sequence location (parent method) have anything to do with a named feature (child method)? My understanding of named features are annotated regions on a sequence, e.g. gene symbols, protein domains, named exons.

I would need to dig deeper into the code to understand if this is actually correct; and if so we need to reconsider the names and descriptions of these functions.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

From my understanding of this and the notebook, these methods converts a Chromsome Location or Gene Location to a Sequence Location

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@reece and @ahwagner can you review the changes to the localizer module?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@korikuzma is correct: the "localization" process is intended to convert a named feature to coordinates. I proposed (and I thought that you and Larry agreed) that we needed to be able to project all features onto a common coordinate system. This operation is necessary in order to identify overlapping features, for example.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, I think I misunderstood; I thought we were "localizing" a sequence location here, but understand now the idea is to localize a feature or chromosome location. That seems appropriately named to me. Thanks @korikuzma and @reece.

return allele_sl


def localize_named_feature(self, loc, assembly_name):
"""converts named features to sequence locations

"""

"""converts named features to sequence locations"""
assert loc.type._value == "ChromosomeLocation", "Expected a ChromosomeLocation object"

def _get_coords(m, cb):
"""return (start,end) of band `cb` in map `m`"""
if cb is None:
return None
return chr_cb_map[cb][0:2]
return m[cb][0:2]
korikuzma marked this conversation as resolved.
Show resolved Hide resolved

try:
map_name = assy_name_to_map_name[assembly_name]
Expand Down Expand Up @@ -98,7 +92,7 @@ def _get_coords(m, cb):
try:
ac = self._ana_maps[assembly_name][loc.chr]
except KeyError as e:
raise ValueError(f"No accssion for {loc.chr} in assembly {assembly_name}") from e
raise ValueError(f"No accession for {loc.chr} in assembly {assembly_name}") from e

return ga4gh.vrs.models.SequenceLocation(
sequence_id=coerce_namespace(ac),
Expand All @@ -108,9 +102,26 @@ def _get_coords(m, cb):
)





if __name__ == "__main__":
cbl = ga4gh.vrs.models.ChromosomeLocation(chr="11", start="q22.3", end="q23.1")
lr = Localizer()

allele_dict = {
"location": {
"chr":"19",
"interval": {
"end": "q13.32",
"start": "q13.32",
"type": "CytobandInterval"
},
"species_id":"taxonomy:9606",
"type":"ChromosomeLocation"
},
"state": {
"sequence": "T",
"type": "LiteralSequenceExpression"
},
"type": "Allele"
}
allele_vo = ga4gh.vrs.models.Allele(**allele_dict)
allele_with_seq_loc = lr.localize_allele(allele_vo)
print("Allele with SequenceLocation:", allele_with_seq_loc.as_dict())
45 changes: 23 additions & 22 deletions src/ga4gh/vrs/extras/translator.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
import hgvs.dataproviders.uta

from ga4gh.core import ga4gh_identify
from ga4gh.vrs import models, normalize
from ga4gh.vrs import models, normalize as do_normalize
from ga4gh.vrs.extras.decorators import lazy_property # this should be relocated
from ga4gh.vrs.utils.hgvs_tools import HgvsTools

Expand All @@ -28,8 +28,6 @@
class ValidationError(Exception):
"""Class for validation errors during translation"""

pass


class Translator:
"""Translates various variation formats to and from GA4GH VRS models
Expand All @@ -48,7 +46,7 @@ class Translator:
spdi_re = re.compile(r"(?P<ac>[^:]+):(?P<pos>\d+):(?P<del_len_or_seq>\w*):(?P<ins_seq>\w*)")


def __init__(self,
def __init__(self, # pylint: disable=too-many-arguments
data_proxy,
default_assembly_name="GRCh38",
translate_sequence_identifiers=True,
Expand Down Expand Up @@ -82,7 +80,7 @@ def translate_from(self, var, fmt=None, require_validation=True):
raise ValueError(f"Unable to parse data as {fmt} variation")
return o

for fmt, t in self.from_translators.items():
for _, t in self.from_translators.items():
o = t(self, var, require_validation=require_validation)
if o:
return o
Expand Down Expand Up @@ -116,7 +114,7 @@ def ensure_allele_is_latest_model(self, allele):
############################################################################
## INTERNAL

def _from_beacon(self, beacon_expr, assembly_name=None, require_validation=True):
def _from_beacon(self, beacon_expr, assembly_name=None, require_validation=True): # pylint: disable=too-many-locals, unused-argument
"""Parse beacon expression into VRS Allele

#>>> a = tlr.from_beacon("19 : 44908822 C > T")
Expand Down Expand Up @@ -158,7 +156,7 @@ def _from_beacon(self, beacon_expr, assembly_name=None, require_validation=True)
return allele


def _from_gnomad(self, gnomad_expr, assembly_name=None, require_validation=True):
def _from_gnomad(self, gnomad_expr, assembly_name=None, require_validation=True): # pylint: disable=unused-argument, too-many-locals
"""Parse gnomAD-style VCF expression into VRS Allele

:param str gnomad_expr: chr-pos-ref-alt
Expand Down Expand Up @@ -208,7 +206,7 @@ def _from_gnomad(self, gnomad_expr, assembly_name=None, require_validation=True)
return allele


def _from_hgvs(self, hgvs_expr, require_validation=True):
def _from_hgvs(self, hgvs_expr, require_validation=True): # pylint: disable=unused-argument
"""parse hgvs into a VRS object (typically an Allele)

#>>> a = tlr.from_hgvs("NM_012345.6:c.22A>T")
Expand All @@ -233,7 +231,7 @@ def _from_hgvs(self, hgvs_expr, require_validation=True):
if not self.hgvs_re.match(hgvs_expr):
return None

sv = self._hgvs_parser.parse_hgvs_variant(hgvs_expr)
sv = self._hgvs_parser.parse_hgvs_variant(hgvs_expr) # pylint: disable=no-member

# prefix accession with namespace
sequence_id = coerce_namespace(sv.ac)
Expand Down Expand Up @@ -277,7 +275,7 @@ def _from_hgvs(self, hgvs_expr, require_validation=True):
return allele


def _from_spdi(self, spdi_expr, require_validation=True):
def _from_spdi(self, spdi_expr, require_validation=True): # pylint: disable=unused-argument

"""Parse SPDI expression in to a GA4GH Allele

Expand Down Expand Up @@ -322,7 +320,7 @@ def _from_spdi(self, spdi_expr, require_validation=True):
return allele


def _from_vrs(self, var, require_validation=True):
def _from_vrs(self, var, require_validation=True): # pylint: disable=unused-argument
"""convert from dict representation of VRS JSON to VRS object"""
if not isinstance(var, Mapping):
return None
Expand All @@ -342,7 +340,7 @@ def _get_hgvs_tools(self):
self.hgvs_tools = HgvsTools(uta_conn)
return self.hgvs_tools

def _to_hgvs(self, vo, namespace="refseq"):
def _to_hgvs(self, vo, namespace="refseq"): # pylint: disable=too-many-locals
"""generates a *list* of HGVS expressions for VRS Allele.

If `namespace` is not None, returns HGVS strings for the
Expand Down Expand Up @@ -385,11 +383,11 @@ def ir_stype(a):
stype = stypes[0]

# build interval and edit depending on sequence type
if stype == "p":
if stype == "p": # pylint: disable=no-else-raise
raise ValueError("Only nucleic acid variation is currently supported")
# ival = hgvs.location.Interval(start=start, end=end)
# edit = hgvs.edit.AARefAlt(ref=None, alt=vo.state.sequence)
else: # pylint: disable=no-else-raise
else:
start, end = vo.location.interval.start.value, vo.location.interval.end.value
# ib: 0 1 2 3 4 5
# h: 1 2 3 4 5
Expand Down Expand Up @@ -421,7 +419,7 @@ def ir_stype(a):
if ns.startswith("GRC") and namespace is None:
continue

if not (any(a.startswith(pfx) for pfx in ("NM", "NP", "NC", "NG"))):
if not any(a.startswith(pfx) for pfx in ("NM", "NP", "NC", "NG")):
continue

var.ac = a
Expand All @@ -435,7 +433,7 @@ def ir_stype(a):

hgvs_exprs += [str(var)]
except hgvs.exceptions.HGVSDataNotAvailableError:
_logger.warning(f"No data found for accession {a}")
_logger.warning("No data found for accession %s", a)
korikuzma marked this conversation as resolved.
Show resolved Hide resolved

return list(set(hgvs_exprs))

Expand Down Expand Up @@ -493,6 +491,9 @@ def _is_valid_ref_seq(self, sequence_id: str, start_pos: int, end_pos: int,


def is_valid_allele(self, vo):
"""Ensure that `vo` is a valid VRS Allele with SequenceLocation and
LiteralSequenceExpression
"""
return (vo.type == "Allele"
and vo.location.type == "SequenceLocation"
and vo.state.type == "LiteralSequenceExpression")
Expand All @@ -515,7 +516,7 @@ def _post_process_imported_allele(self, allele):
allele.location.sequence_id = seq_id

if self.normalize:
allele = normalize(allele, self.data_proxy)
allele = do_normalize(allele, self.data_proxy)

if self.identify:
allele._id = ga4gh_identify(allele)
Expand Down Expand Up @@ -582,16 +583,16 @@ def _seq_id_mapper(self, ir):
"type": "SequenceInterval"
}
]
formats = ["hgvs", "gnomad", "beacon", "spdi", "vrs", None]
from_formats = ["hgvs", "gnomad", "beacon", "spdi", "vrs", None]

for e in expressions:
print(f"* {e}")
for f in formats:
for f in from_formats:
try:
o = tlr.translate_from(e, f)
r = o.type
vrs_obj = tlr.translate_from(e, f)
r = vrs_obj.type
except ValueError:
r = "-"
except Exception as ex:
except Exception as ex: # pylint: disable=broad-except
r = ex.__class__.__name__
print(f" {f}: {r}")
6 changes: 3 additions & 3 deletions src/ga4gh/vrs/extras/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,19 +35,19 @@ def _format_time(timespan, precision=3):
value = int(leftover / length)
if value > 0:
leftover = leftover % length
time.append(u"%s%s" % (str(value), suffix))
time.append(f"{value}{suffix}")
if leftover < 1:
break
return " ".join(time)

units = [u"s", u"ms", u"us", u"ns"] # the save value
units = ["s", "ms", "us", "ns"] # the save value
scaling = [1, 1e3, 1e6, 1e9]

if timespan > 0.0:
order = min(-int(math.floor(math.log10(timespan)) // 3), 3)
else:
order = 3
return u"%.*g %s" % (precision, timespan * scaling[order], units[order])
return f"{timespan * scaling[order]:.{precision}g} {units[order]}"


def hex_to_base64url(s):
Expand Down
Loading