diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 00000000..23d23de0
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,16 @@
+# See https://pre-commit.com for more information
+# See https://pre-commit.com/hooks.html for more hooks
+repos:
+  - repo: local
+    hooks:
+      - id: pylint
+        name: pylint
+        entry: pylint
+        language: system
+        types: [python]
+        require_serial: true
+        exclude: ^tests/
+        args:
+          - src/ga4gh/core
+          - src/ga4gh/vrs
+          - --fail-under=10
\ No newline at end of file
diff --git a/.pylintrc b/.pylintrc
index 2de19cfc..784b35e8 100644
--- a/.pylintrc
+++ b/.pylintrc
@@ -28,6 +28,18 @@ max-line-length=120
 # Maximum number of lines in a module.
 max-module-lines=1000
 
+[MESSAGES CONTROL]
+
+# Disable the message, report, category or checker with the given id(s). You
+# can either give multiple identifiers separated by comma (,) or put this
+# option multiple times (only on the command line, not in the configuration
+# file where it should appear only once). You can also use "--disable=all" to
+# disable everything first and then re-enable specific checks. For example, if
+# you want to run only the similarities checker, you can use "--disable=all
+# --enable=similarities". If you want to run only the classes checker, but have
+# no Warning level messages displayed, use "--disable=all --enable=classes
+# --disable=W".
+disable=fixme,protected-access,too-few-public-methods
 
 
 [STRING]
@@ -138,5 +150,5 @@ min-public-methods=2
 
 # Exceptions that will emit a warning when being caught. Defaults to
 # "BaseException, Exception".
-overgeneral-exceptions=BaseException,
-                       Exception
+overgeneral-exceptions=builtins.BaseException,
+                       builtins.Exception
diff --git a/README.md b/README.md
index 2cd8bfe1..8375fe18 100644
--- a/README.md
+++ b/README.md
@@ -133,6 +133,14 @@ Fork the repo at https://github.com/ga4gh/vrs-python/ .
     $ cd vrs-python
     $ make devready
 
+## Pre-commit
+
+We use [pre-commit](https://pre-commit.com/) to check code style.
+
+Before your first commit, run:
+
+    $ pre-commit install
+
 ## Testing
 
 This package implements typical unit tests for ga4gh.core and ga4gh.vrs.  This
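The `repo: local` hook above runs whatever `pylint` is installed in the developer's environment (`language: system`) against `src/ga4gh/core` and `src/ga4gh/vrs`, with `exclude: ^tests/` keeping test modules out of scope and `--fail-under=10` failing the hook whenever the score drops below 10/10. After `pre-commit install`, the same check can be run on demand, either through pre-commit or with the hook's configured arguments directly:

    $ pre-commit run --all-files
    $ pylint src/ga4gh/core src/ga4gh/vrs --fail-under=10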
diff --git a/setup.cfg b/setup.cfg
index 3c0d4e19..4de64f64 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -97,6 +97,7 @@ dev =
     tox
     vcrpy
     yapf
+    pre-commit
 extras =
     psycopg2-binary
     biocommons.seqrepo>=0.5.1
diff --git a/src/ga4gh/core/__init__.py b/src/ga4gh/core/__init__.py
index 0045157c..9dcfdba2 100644
--- a/src/ga4gh/core/__init__.py
+++ b/src/ga4gh/core/__init__.py
@@ -12,7 +12,7 @@
                                      parse_ga4gh_identifier)
 from ._internal.jsonschema import (build_models, build_class_referable_attribute_map, is_pjs_class, is_pjs_instance,
                                    is_curie, is_identifiable, is_literal, pjs_copy)
-
+# pylint: disable=duplicate-code
 try:
     __version__ = get_distribution(__name__).version
 except DistributionNotFound:    # pragma: nocover
diff --git a/src/ga4gh/core/_internal/identifiers.py b/src/ga4gh/core/_internal/identifiers.py
index 14b84b8e..1fd062b9 100644
--- a/src/ga4gh/core/_internal/identifiers.py
+++ b/src/ga4gh/core/_internal/identifiers.py
@@ -32,13 +32,14 @@
 
 # Assume that ga4gh.yaml and vrs.yaml files are in the same directory for now
 schema_dir = os.environ.get("VRSATILE_SCHEMA_DIR", pkg_resources.resource_filename(__name__, "data/schemas/vrsatile"))
-cfg = yaml.safe_load(open(schema_dir + "/merged.yaml"))
-defs = cfg["definitions"]
+with open(schema_dir + "/merged.yaml", "r", encoding="utf-8") as f:
+    cfg = yaml.safe_load(f)
 
-type_prefix_map_default = dict()
-for k,v in defs.items():
-    if "ga4gh_prefix" in v:
-        type_prefix_map_default[k] = v["ga4gh_prefix"]
+defs = cfg["definitions"]
+type_prefix_map_default = {}
+for key, val in defs.items():
+    if "ga4gh_prefix" in val:
+        type_prefix_map_default[key] = val["ga4gh_prefix"]
 
 namespace = "ga4gh"
 curie_sep = ":"
@@ -147,7 +148,7 @@ def ga4gh_serialize(vro):
 
     """
 
-    def dictify(vro, enref=True):
+    def dictify(vro, enref=True):  # pylint: disable=too-many-return-statements
         """recursively converts (any) object to dictionary prior to
         serialization
 
diff --git a/src/ga4gh/core/_internal/jsonschema.py b/src/ga4gh/core/_internal/jsonschema.py
index 5f299378..90f9fb08 100644
--- a/src/ga4gh/core/_internal/jsonschema.py
+++ b/src/ga4gh/core/_internal/jsonschema.py
@@ -27,7 +27,7 @@
 
 def build_models(path, standardize_names=False):
     """load models from json schema at path"""
-    with open(path, "r") as yaml_file:
+    with open(path, "r", encoding="utf-8") as yaml_file:
         y = yaml.load(yaml_file, yaml.SafeLoader)
         builder = pjs.ObjectBuilder(pjs_filter(y))
         models = builder.build_classes(standardize_names=standardize_names)
diff --git a/src/ga4gh/vrs/dataproxy.py b/src/ga4gh/vrs/dataproxy.py
index 48cdb56c..53db185b 100644
--- a/src/ga4gh/vrs/dataproxy.py
+++ b/src/ga4gh/vrs/dataproxy.py
@@ -152,7 +152,7 @@ def _get_sequence(self, identifier, start=None, end=None):
         url = self.base_url + f"sequence/{identifier}"
         _logger.info("Fetching %s", url)
         params = {"start": start, "end": end}
-        resp = requests.get(url, params=params)
+        resp = requests.get(url, params=params, timeout=5)
         if resp.status_code == 404:
             raise KeyError(identifier)
         resp.raise_for_status()
@@ -161,7 +161,7 @@ def _get_metadata(self, identifier):
         url = self.base_url + f"metadata/{identifier}"
         _logger.info("Fetching %s", url)
-        resp = requests.get(url)
+        resp = requests.get(url, timeout=5)
         if resp.status_code == 404:
             raise KeyError(identifier)
         resp.raise_for_status()
 
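The `timeout=5` added to both REST calls means a hung SeqRepo REST service now raises `requests.Timeout` instead of blocking indefinitely, while an unknown identifier is still surfaced as `KeyError` through the 404 check. A minimal sketch of handling both cases, assuming a local seqrepo-rest-service at the URL shown (the identifier and coordinates are only illustrative):

```python
import requests

from ga4gh.vrs.dataproxy import SeqRepoRESTDataProxy

# Assumed local seqrepo-rest-service endpoint; adjust for your deployment.
dp = SeqRepoRESTDataProxy(base_url="http://localhost:5000/seqrepo")

try:
    # First two bases of GRCh38 chromosome 19, addressed by RefSeq accession.
    seq = dp.get_sequence("refseq:NC_000019.10", start=0, end=2)
    print(seq)
except KeyError:
    print("identifier not known to this SeqRepo instance")  # the 404 path
except requests.Timeout:
    print("no response within 5 seconds")  # behavior introduced by timeout=5
```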
diff --git a/src/ga4gh/vrs/extras/localizer.py b/src/ga4gh/vrs/extras/localizer.py
index 5fcd4033..9550e3d1 100644
--- a/src/ga4gh/vrs/extras/localizer.py
+++ b/src/ga4gh/vrs/extras/localizer.py
@@ -1,10 +1,6 @@
 """convert named locations into SequenceLocations ("localize") by reference to external data
-
 """
-
-import copy
-
 
 from bioutils.accessions import coerce_namespace
 from bioutils.assemblies import make_name_ac_map
 from bioutils.cytobands import get_cytoband_maps
@@ -45,27 +41,25 @@ def __init__(self):
 
         self._ana_maps = {k: make_name_ac_map(k) for k in assy_name_to_map_name}
 
-    def localize_allele(self, allele):
+    def localize_allele(self, allele, assembly_name="GRCh38"):
+        """Converts a VRS Allele's named features to a SequenceLocation"""
 
         # copy input variant and replace location
         # N.B. deepcopy leads to recursion errors
         allele_sl = ga4gh.vrs.models.Variation(**allele.as_dict())
         del allele_sl.id
-        allele_sl.location = self.localize(allele.location)
+        allele_sl.location = self.localize_named_feature(allele.location, assembly_name)
         return allele_sl
 
     def localize_named_feature(self, loc, assembly_name):
-        """converts named features to sequence locations
-
-        """
-
+        """converts named features to sequence locations"""
         assert loc.type._value == "ChromosomeLocation", "Expected a ChromosomeLocation object"
 
         def _get_coords(m, cb):
             """return (start,end) of band `cb` in map `m`"""
             if cb is None:
                 return None
-            return chr_cb_map[cb][0:2]
+            return m[cb][0:2]
 
         try:
             map_name = assy_name_to_map_name[assembly_name]
@@ -82,14 +76,14 @@ def _get_coords(m, cb):
 
         coords = []
         try:
-            coords += _get_coords(chr_cb_map, loc.interval.start)
+            coords += _get_coords(chr_cb_map, loc.start)
         except (KeyError, ValueError) as e:
-            raise ValueError(f"{loc.interval.start}: ChromosomeLocation not in"
-                             " map for {assembly_name}, chr {loc.chr}") from e
+            raise ValueError(f"{loc.start}: ChromosomeLocation not in"
+                             f" map for {assembly_name}, chr {loc.chr}") from e
         try:
-            coords += _get_coords(chr_cb_map, loc.interval.end)
+            coords += _get_coords(chr_cb_map, loc.end)
         except (KeyError, ValueError) as e:
-            raise ValueError(f"{loc.interval.end}: ChromosomeLocation not in"
-                             " map for {assembly_name}, chr {loc.chr}") from e
+            raise ValueError(f"{loc.end}: ChromosomeLocation not in"
+                             f" map for {assembly_name}, chr {loc.chr}") from e
 
         # the following works regardless of orientation of bands and number of bands
@@ -98,19 +92,31 @@ def _get_coords(m, cb):
         try:
             ac = self._ana_maps[assembly_name][loc.chr]
         except KeyError as e:
-            raise ValueError(f"No accssion for {loc.chr} in assembly {assembly_name}") from e
+            raise ValueError(f"No accession for {loc.chr} in assembly {assembly_name}") from e
 
         return ga4gh.vrs.models.SequenceLocation(
             sequence_id=coerce_namespace(ac),
-            interval=ga4gh.vrs.models.SequenceInterval(
-                start=ga4gh.vrs.models.Number(value=start),
-                end=ga4gh.vrs.models.Number(value=end))
-            )
-
-
-
+            start=ga4gh.vrs.models.Number(value=start),
+            end=ga4gh.vrs.models.Number(value=end)
+        )
 
 
 if __name__ == "__main__":
-    cbl = ga4gh.vrs.models.ChromosomeLocation(chr="11", start="q22.3", end="q23.1")
     lr = Localizer()
+    allele_dict = {
+        "location": {
+            "chr": "19",
+            "end": "q13.32",
+            "start": "q13.32",
+            "species_id": "taxonomy:9606",
+            "type": "ChromosomeLocation"
+        },
+        "state": {
+            "sequence": "T",
+            "type": "LiteralSequenceExpression"
+        },
+        "type": "Allele"
+    }
+    allele_vo = ga4gh.vrs.models.Allele(**allele_dict)
+    allele_with_seq_loc = lr.localize_allele(allele_vo)
+    print("Allele with SequenceLocation:", allele_with_seq_loc.as_dict())
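`localize_named_feature` can also be called directly when only the location needs converting; `localize_allele` is now a thin wrapper that substitutes the localized location into a copy of the Allele. A sketch under that reading, reusing the ChromosomeLocation fields from the demo above and the default GRCh38 assembly:

```python
from ga4gh.vrs import models
from ga4gh.vrs.extras.localizer import Localizer

lr = Localizer()
chr_loc = models.ChromosomeLocation(
    chr="19", start="q13.32", end="q13.32", species_id="taxonomy:9606"
)

# Maps the cytoband interval onto a GRCh38 SequenceLocation.
seq_loc = lr.localize_named_feature(chr_loc, assembly_name="GRCh38")
print(seq_loc.as_dict())
```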
diff --git a/src/ga4gh/vrs/extras/translator.py b/src/ga4gh/vrs/extras/translator.py
index c37cc346..be88d593 100644
--- a/src/ga4gh/vrs/extras/translator.py
+++ b/src/ga4gh/vrs/extras/translator.py
@@ -18,7 +18,7 @@
 import hgvs.dataproviders.uta
 
 from ga4gh.core import ga4gh_identify
-from ga4gh.vrs import models, normalize
+from ga4gh.vrs import models, normalize as do_normalize
 from ga4gh.vrs.extras.decorators import lazy_property  # this should be relocated
 from ga4gh.vrs.utils.hgvs_tools import HgvsTools
 
@@ -42,7 +42,7 @@ class Translator:
 
     spdi_re = re.compile(r"(?P<ac>[^:]+):(?P<pos>\d+):(?P<del_len_or_seq>\w*):(?P<ins_seq>\w*)")
 
-    def __init__(self,
+    def __init__(self,  # pylint: disable=too-many-arguments
                  data_proxy,
                  default_assembly_name="GRCh38",
                  translate_sequence_identifiers=True,
@@ -72,7 +72,7 @@ def translate_from(self, var, fmt=None):
                 raise ValueError(f"Unable to parse data as {fmt} variation")
             return o
 
-        for fmt, t in self.from_translators.items():
+        for _, t in self.from_translators.items():
             o = t(self, var)
             if o:
                 return o
@@ -192,7 +192,7 @@ def _from_hgvs(self, hgvs_expr):
         if not self.hgvs_re.match(hgvs_expr):
             return None
 
-        sv = self._hgvs_parser.parse_hgvs_variant(hgvs_expr)
+        sv = self._hgvs_parser.parse_hgvs_variant(hgvs_expr)  # pylint: disable=no-member
 
         # prefix accession with namespace
         sequence_id = coerce_namespace(sv.ac)
@@ -296,7 +296,7 @@ def _get_hgvs_tools(self):
             self.hgvs_tools = HgvsTools(uta_conn)
         return self.hgvs_tools
 
-    def _to_hgvs(self, vo, namespace="refseq"):
+    def _to_hgvs(self, vo, namespace="refseq"):  # pylint: disable=too-many-locals
         """generates a *list* of HGVS expressions for VRS Allele.
 
         If `namespace` is not None, returns HGVS strings for the
@@ -339,7 +339,7 @@ def ir_stype(a):
         stype = stypes[0]
 
         # build interval and edit depending on sequence type
-        if stype == "p":
+        if stype == "p":  # pylint: disable=no-else-raise
             raise ValueError("Only nucleic acid variation is currently supported")
             # ival = hgvs.location.Interval(start=start, end=end)
             # edit = hgvs.edit.AARefAlt(ref=None, alt=vo.state.sequence)
@@ -375,7 +375,7 @@ def ir_stype(a):
             if ns.startswith("GRC") and namespace is None:
                 continue
 
-            if not (any(a.startswith(pfx) for pfx in ("NM", "NP", "NC", "NG"))):
+            if not any(a.startswith(pfx) for pfx in ("NM", "NP", "NC", "NG")):
                 continue
 
             var.ac = a
@@ -389,7 +389,7 @@ def ir_stype(a):
                 hgvs_exprs += [str(var)]
             except hgvs.exceptions.HGVSDataNotAvailableError:
-                _logger.warning(f"No data found for accession {a}")
+                _logger.warning("No data found for accession %s", a)
 
         return list(set(hgvs_exprs))
 
 
@@ -425,6 +425,9 @@ def _to_spdi(self, vo, namespace="refseq"):
 
 
     def is_valid_allele(self, vo):
+        """Return True if `vo` is a VRS Allele with a SequenceLocation
+        and a LiteralSequenceExpression state.
+        """
         return (vo.type == "Allele"
                 and vo.location.type == "SequenceLocation"
                 and vo.state.type == "LiteralSequenceExpression")
@@ -447,7 +450,7 @@ def _post_process_imported_allele(self, allele):
         allele.location.sequence_id = seq_id
 
         if self.normalize:
-            allele = normalize(allele, self.data_proxy)
+            allele = do_normalize(allele, self.data_proxy)
 
         if self.identify:
             allele.id = ga4gh_identify(allele)
@@ -511,16 +514,16 @@ def _seq_id_mapper(self, ir):
             "start": {"value": 21, "type": "Number"}
         }
     ]
 
-    formats = ["hgvs", "gnomad", "beacon", "spdi", "vrs", None]
+    from_formats = ["hgvs", "gnomad", "beacon", "spdi", "vrs", None]
     for e in expressions:
         print(f"* {e}")
-        for f in formats:
+        for f in from_formats:
             try:
-                o = tlr.translate_from(e, f)
-                r = o.type
+                vrs_obj = tlr.translate_from(e, f)
+                r = vrs_obj.type
             except ValueError:
                 r = "-"
-            except Exception as ex:
+            except Exception as ex:  # pylint: disable=broad-except
                 r = ex.__class__.__name__
             print(f"  {f}: {r}")
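Beyond the `__main__` smoke test above, the translator is typically constructed with a data proxy and used in both directions. A sketch assuming a local seqrepo-rest-service at the usual URL; translating back to HGVS would additionally require UTA access, so the sketch round-trips through SPDI instead:

```python
from ga4gh.vrs.dataproxy import SeqRepoRESTDataProxy
from ga4gh.vrs.extras.translator import Translator

dp = SeqRepoRESTDataProxy(base_url="http://localhost:5000/seqrepo")  # assumed local service
tlr = Translator(data_proxy=dp)

# Parse an HGVS expression into a VRS Allele (normalized and identified by default) ...
allele = tlr.translate_from("NC_000019.10:g.44908822C>T", "hgvs")

# ... then express the same Allele in another supported format.
print(tlr.translate_to(allele, "spdi"))
```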
diff --git a/src/ga4gh/vrs/extras/utils.py b/src/ga4gh/vrs/extras/utils.py
index 9c96caca..afcb306b 100644
--- a/src/ga4gh/vrs/extras/utils.py
+++ b/src/ga4gh/vrs/extras/utils.py
@@ -35,19 +35,19 @@ def _format_time(timespan, precision=3):
             value = int(leftover / length)
             if value > 0:
                 leftover = leftover % length
-                time.append(u"%s%s" % (str(value), suffix))
+                time.append(f"{value}{suffix}")
             if leftover < 1:
                 break
         return " ".join(time)
 
-    units = [u"s", u"ms", u"us", u"ns"]  # the save value
+    units = ["s", "ms", "us", "ns"]  # the save value
     scaling = [1, 1e3, 1e6, 1e9]
 
     if timespan > 0.0:
         order = min(-int(math.floor(math.log10(timespan)) // 3), 3)
     else:
         order = 3
-    return u"%.*g %s" % (precision, timespan * scaling[order], units[order])
+    return f"{timespan * scaling[order]:.{precision}g} {units[order]}"
 
 
 def hex_to_base64url(s):
diff --git a/src/ga4gh/vrs/extras/variation_normalizer_rest_dp.py b/src/ga4gh/vrs/extras/variation_normalizer_rest_dp.py
index 63edba6e..7c281eb2 100644
--- a/src/ga4gh/vrs/extras/variation_normalizer_rest_dp.py
+++ b/src/ga4gh/vrs/extras/variation_normalizer_rest_dp.py
@@ -1,9 +1,10 @@
+"""Module for the Variation Normalizer REST Data Proxy. This is needed for the notebooks,
+so that users do not have to manually install and set up UTA.
+"""
 import requests
 
 class VariationNormalizerRESTDataProxy:
-    """
-    Rest data proxy for Variation Normalizer API
-    """
+    """Rest data proxy for Variation Normalizer API"""
     def __init__(self) -> None:
         """
         Initialize class with the API URL
@@ -11,23 +12,22 @@ def __init__(self) -> None:
         self.api = "https://normalize.cancervariants.org/variation"
 
     def to_hgvs(self, vo, namespace="refseq"):
-        """
-        tranlsate vrs allele object (vo) to hgvs format
+        """Translate a VRS Allele object (vo) to HGVS format.
 
         Use this method if you don't have UTA installed locally or are unable
         to reach the public UTA database due to port issues.
         """
         vo = vo.as_dict()
-        data = dict(
-            variation = vo,
-            namespace = namespace
-        )
+        data = {
+            "variation": vo,
+            "namespace": namespace
+        }
         r = requests.post(
             url = f"{self.api}/vrs_allele_to_hgvs",
             json=data,
-            headers={ "Content-Type": "application/json; charset=utf-8" }
+            headers={ "Content-Type": "application/json; charset=utf-8" },
+            timeout=5
         )
-        if r.status_code == 200:
-            return r.json().get("variations", [])
-        else:
+        if r.status_code != 200:
             raise requests.HTTPError(f"Variation normalizer returned the status code: {r.status_code}.")
+        return r.json().get("variations", [])
diff --git a/src/ga4gh/vrs/extras/vcf_annotation.py b/src/ga4gh/vrs/extras/vcf_annotation.py
index 464477dc..860272af 100644
--- a/src/ga4gh/vrs/extras/vcf_annotation.py
+++ b/src/ga4gh/vrs/extras/vcf_annotation.py
@@ -46,12 +46,14 @@ def annotate(self, inputfile, outputfile, vrsfile):
         param: str outputfile The path and filename for the output VCF file
         param: str vrsfile The path and filename for the output VRS object file
         """
-        INFO_FIELD_ID = "VRS_Allele"
+        INFO_FIELD_ID = "VRS_Allele"  # pylint: disable=invalid-name
         vrs_data = {}
-        vcf_in = pysam.VariantFile(filename=inputfile)
+        vcf_in = pysam.VariantFile(filename=inputfile)  # pylint: disable=no-member
         vcf_in.header.info.add(INFO_FIELD_ID, "1", "String", "vrs")
-        vcf_out = pysam.VariantFile(outputfile, "w", header=vcf_in.header)
-        vrs_out = open(vrsfile, "wb")  # For sending VRS data to the pickle file
+        vcf_out = pysam.VariantFile(outputfile, "w", header=vcf_in.header)  # pylint: disable=no-member
+
+        # For sending VRS data to the pickle file
+        vrs_out = open(vrsfile, "wb")  # pylint: disable=consider-using-with
 
         for record in vcf_in:
             ld = self._record_digests(record, vrs_data)
@@ -111,12 +113,12 @@ def parse_args(argv):
     options = parse_args(sys.argv[1:])
     print(f"These are the options that you have selected: {options}\n")
     data_proxy = SeqRepoDataProxy(SeqRepo("/usr/local/share/seqrepo/latest"))
-    tlr = Translator(data_proxy)
-    vcf_annotator = VCFAnnotator(tlr)
+    translator = Translator(data_proxy)
+    vcf_annotator = VCFAnnotator(translator)
     vcf_annotator.annotate(options.VCF_IN, options.out, options.vrs_file)
     end_time = time.time()
     total_time = (float(end_time) - float(start_time))
-    total_time_minutes = (total_time / 60)
+    total_time_minutes = total_time / 60
     print(f"This program took {total_time} seconds to run.")
     print(f"This program took {total_time_minutes} minutes to run.")
 
diff --git a/src/ga4gh/vrs/normalize.py b/src/ga4gh/vrs/normalize.py
index e5f3c67a..dd4def17 100644
--- a/src/ga4gh/vrs/normalize.py
+++ b/src/ga4gh/vrs/normalize.py
@@ -51,12 +51,12 @@ def _normalize_allele(allele, data_proxy):
     return new_allele
 
 
-def _normalize_haplotype(o, data_proxy=None):
+def _normalize_haplotype(o, data_proxy=None):  # pylint: disable=unused-argument
     o.members = sorted(o.members, key=ga4gh_digest)
     return o
 
 
-def _normalize_variationset(o, data_proxy=None):
+def _normalize_variationset(o, data_proxy=None):  # pylint: disable=unused-argument
     o.members = sorted(o.members, key=ga4gh_digest)
     return o
 
@@ -105,13 +105,13 @@
         },
         "type": "Allele"
     }
-    allele = models.Allele(**allele_dict)
+    allele1 = models.Allele(**allele_dict)
 
-    allele2 = normalize(allele, dp)
+    allele2 = normalize(allele1, dp)
 
-    allele.state.sequence = "C"
-    allele3 = normalize(allele, dp)
+    allele1.state.sequence = "C"
+    allele3 = normalize(allele1, dp)
 
-    allele.location.interval.end.value = 44908823
-    allele.state.sequence = ""
-    allele4 = normalize(allele, dp)
+    allele1.location.interval.end.value = 44908823
+    allele1.state.sequence = ""
+    allele4 = normalize(allele1, dp)
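For completeness, a standalone sketch of the demo above with an `allele_dict` written out in full. The sequence digest is assumed to be GRCh38 chromosome 19, as in the common VRS examples, and the data proxy is assumed to be a local seqrepo-rest-service; substitute your own values if they differ:

```python
from ga4gh.vrs import models, normalize
from ga4gh.vrs.dataproxy import SeqRepoRESTDataProxy

dp = SeqRepoRESTDataProxy(base_url="http://localhost:5000/seqrepo")  # assumed local service

allele_dict = {
    "location": {
        "sequence_id": "ga4gh:SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl",  # assumed GRCh38 chr 19
        "interval": {
            "start": {"value": 44908821, "type": "Number"},
            "end": {"value": 44908822, "type": "Number"},
            "type": "SequenceInterval"
        },
        "type": "SequenceLocation"
    },
    "state": {"sequence": "T", "type": "LiteralSequenceExpression"},
    "type": "Allele"
}

allele1 = models.Allele(**allele_dict)
allele2 = normalize(allele1, dp)  # fully-justified normalization against the reference
print(allele2.as_dict())
```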
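Returning to the `vcf_annotation.py` changes earlier in the diff: outside the command-line entry point, the annotator can be driven directly. `annotate` adds a `VRS_Allele` INFO field to each record of the output VCF and writes the collected VRS data to the pickle file. A sketch using the same SeqRepo path as that module's `__main__` block; the file names are placeholders:

```python
from biocommons.seqrepo import SeqRepo

from ga4gh.vrs.dataproxy import SeqRepoDataProxy
from ga4gh.vrs.extras.translator import Translator
from ga4gh.vrs.extras.vcf_annotation import VCFAnnotator

# Same SeqRepo path as the module's __main__ block; adjust to your installation.
data_proxy = SeqRepoDataProxy(SeqRepo("/usr/local/share/seqrepo/latest"))
annotator = VCFAnnotator(Translator(data_proxy))

# Placeholder file names: input VCF, annotated output VCF, pickled VRS objects.
annotator.annotate("input.vcf.gz", "annotated.vcf.gz", "vrs_objects.pkl")
```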