From 4e65955cc6755f006f1708829f5522950cfe872a Mon Sep 17 00:00:00 2001 From: James Stevenson Date: Tue, 29 Oct 2024 15:04:06 -0400 Subject: [PATCH 1/2] cicd: fix action caching key (#455) Use an existing file that dictates dependency changes, instead of an old (unused) file --- .github/workflows/python-cqa.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python-cqa.yml b/.github/workflows/python-cqa.yml index ac04a7a6..02a2b724 100644 --- a/.github/workflows/python-cqa.yml +++ b/.github/workflows/python-cqa.yml @@ -24,7 +24,7 @@ jobs: - uses: actions/cache@v4 with: path: ~/.cache/pip - key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt') }} + key: ${{ runner.os }}-pip-${{ hashFiles('pyproject.toml') }} restore-keys: | ${{ runner.os }}-pip- - name: Set up Python ${{ matrix.python-version }} From 213b3aa0dfc6e45f25cead332c48a8b719adc523 Mon Sep 17 00:00:00 2001 From: James Stevenson Date: Wed, 30 Oct 2024 08:03:40 -0400 Subject: [PATCH 2/2] cicd: add basic precommit hooks (#454) * Add precommit + some minimally controversial hooks. Add a note in README. * Run hooks over existing codebase. There's a few instances of trailing whitespace and incorrect line endings. * Run checks in CI --- .github/workflows/python-cqa.yml | 9 ++++++- .gitignore | 2 +- .pre-commit-config.yaml | 14 +++++++++++ Makefile | 3 ++- README.md | 9 ++++++- docs/setup_help/m1_mac_setup.md | 2 +- docs/setup_help/uta_installation.md | 6 ++--- notebooks/getting_started/README.md | 5 ---- pyproject.toml | 1 + src/ga4gh/vrs/models.py | 4 ++-- src/ga4gh/vrs/utils/hgvs_tools.py | 35 ++++++++++++++-------------- submodules/README.md | 1 - tests/extras/data/test_vcf_input.vcf | 2 +- 13 files changed, 58 insertions(+), 35 deletions(-) create mode 100644 .pre-commit-config.yaml diff --git a/.github/workflows/python-cqa.yml b/.github/workflows/python-cqa.yml index 02a2b724..ff0e868d 100644 --- a/.github/workflows/python-cqa.yml +++ b/.github/workflows/python-cqa.yml @@ -24,7 +24,7 @@ jobs: - uses: actions/cache@v4 with: path: ~/.cache/pip - key: ${{ runner.os }}-pip-${{ hashFiles('pyproject.toml') }} + key: ${{ runner.os }}-pip-test-${{ hashFiles('pyproject.toml') }} restore-keys: | ${{ runner.os }}-pip- - name: Set up Python ${{ matrix.python-version }} @@ -40,3 +40,10 @@ jobs: - name: Test with pytest run: | python -m pytest + + precommit_hooks: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + - uses: pre-commit/action@v3.0.1 diff --git a/.gitignore b/.gitignore index de6e5f2e..a6775940 100644 --- a/.gitignore +++ b/.gitignore @@ -23,4 +23,4 @@ lint uta_*.pgd.gz .vscode *.log -.idea \ No newline at end of file +.idea diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 00000000..6dbca4d4 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,14 @@ +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v5.0.0 + hooks: + - id: check-added-large-files + - id: detect-private-key + - id: trailing-whitespace + - id: end-of-file-fixer + - id: check-merge-conflict + - id: detect-aws-credentials + args: [ --allow-missing-credentials ] + - id: mixed-line-ending + args: [ --fix=lf ] +minimum_pre_commit_version: 4.0.1 diff --git a/Makefile b/Makefile index 5d9c5ec0..e7130653 100644 --- a/Makefile +++ b/Makefile @@ -47,7 +47,8 @@ venv/%: #=> develop: install package in develop mode .PHONY: develop setup develop setup: - pip install -e .[dev,extras,notebooks] + pip install -e .[dev,extras,notebooks]; \ + pre-commit install #=> devready: create venv, install prerequisites, install pkg in develop mode .PHONY: devready diff --git a/README.md b/README.md index 51c39921..3a71c4b2 100644 --- a/README.md +++ b/README.md @@ -192,7 +192,7 @@ This section is intended for developers who contribute to VRS-Python. ### Installing for development -Fork the repo at . +Fork the repo at and initialize a development environment. ```shell git clone --recurse-submodules git@github.com:YOUR_GITHUB_ID/vrs-python.git @@ -201,6 +201,13 @@ make devready source venv/3.12/bin/activate ``` +This setup includes [pre-commit hooks](https://pre-commit.com/). If you create a virtual environment manually, be sure to install the hooks yourself; otherwise, commits may fail during [CI/CD checks](https://github.com/ga4gh/vrs-python/actions/workflows/python-cqa.yml): + +```shell +source venv/3.12/bin/activate +pre-commit install +``` + If you already cloned the repo, but forgot to include `--recurse-submodules` you can run: ```shell diff --git a/docs/setup_help/m1_mac_setup.md b/docs/setup_help/m1_mac_setup.md index ae36f745..0b354cb6 100644 --- a/docs/setup_help/m1_mac_setup.md +++ b/docs/setup_help/m1_mac_setup.md @@ -46,4 +46,4 @@ export PKG_CONFIG_PATH="/opt/homebrew/opt/openssl@1.1/lib/pkgconfig:/opt/homebre 14. Run the make devready command: 1. `make devready` 15. Run the make test command: - 1. `make test` \ No newline at end of file + 1. `make test` diff --git a/docs/setup_help/uta_installation.md b/docs/setup_help/uta_installation.md index 27b89c1e..f0a689da 100644 --- a/docs/setup_help/uta_installation.md +++ b/docs/setup_help/uta_installation.md @@ -8,7 +8,7 @@ 4. Create roles for the application, give login and CREATEDB permissions: 1. `CREATE ROLE uta_admin WITH LOGIN CREATEDB;` 2. `CREATE ROLE anonymous WITH LOGIN CREATEDB;` -5. List the users using the command +5. List the users using the command 1. `\du` 6. Create the UTA Database object 1. `CREATE DATABASE uta;` @@ -16,7 +16,7 @@ 1. `GRANT ALL PRIVILEGES ON DATABASE uta TO uta_admin;` 8. Exit postgres 1. `\q` -9. Download the UTA database and place it in the uta database object that you created before (**This step takes around 5 hours**). +9. Download the UTA database and place it in the uta database object that you created before (**This step takes around 5 hours**). 1. `export UTA_VERSION=uta_20210129.pgd.gz\ncurl -O http://dl.biocommons.org/uta/$UTA_VERSION\ngzip -cdq ${UTA_VERSION} | psql -h localhost -U uta_admin --echo-errors --single-transaction -v ON_ERROR_STOP=1 -d uta -p 5432` 10. Set your UTA path 1. `export UTA_DB_URL=postgresql://uta_admin@localhost:5432/uta/uta_20210129` @@ -30,4 +30,4 @@ gzip -cdq ${UTA_VERSION} | grep -v "^REFRESH MATERIALIZED VIEW" | psql -h localh 1. `REFRESH MATERIALIZED VIEW uta_20210129.exon_set_exons_fp_mv;` 2. `REFRESH MATERIALIZED VIEW uta_20210129.tx_exon_set_summary_mv;` 3. `REFRESH MATERIALIZED VIEW uta_20210129.tx_def_summary_mv;` - 4. `REFRESH MATERIALIZED VIEW uta_20210129.tx_similarity_mv;` #**This step will take 5 or more hours** \ No newline at end of file + 4. `REFRESH MATERIALIZED VIEW uta_20210129.tx_similarity_mv;` #**This step will take 5 or more hours** diff --git a/notebooks/getting_started/README.md b/notebooks/getting_started/README.md index bd766616..74e7cd47 100644 --- a/notebooks/getting_started/README.md +++ b/notebooks/getting_started/README.md @@ -42,8 +42,3 @@ we show how to transform basic variants to VRS, and in some cases, back to the o The final notebook of this series, [Exploring the CNV Translator](5_Exploring_the_CnvTranslator.ipynb) details transformations of various forms of copy number variation to their VRS representations. - - - - - diff --git a/pyproject.toml b/pyproject.toml index 785fe401..8b68fa65 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -58,6 +58,7 @@ dev = [ "vcrpy", "pyyaml", # style + "pre-commit>=4.0.1", "pylint", "yapf", # docs diff --git a/src/ga4gh/vrs/models.py b/src/ga4gh/vrs/models.py index 70a85c80..fbfc02ee 100644 --- a/src/ga4gh/vrs/models.py +++ b/src/ga4gh/vrs/models.py @@ -703,7 +703,7 @@ class ga4gh(_ValueObject.ga4gh): 'component', 'orientation', 'type' - ] + ] class DerivativeMolecule(_VariationBase): """The "Derivative Molecule" data class is a structure for describing a derivate @@ -716,7 +716,7 @@ class DerivativeMolecule(_VariationBase): description="The traversal block components that make up the derivative molecule.", min_length=2 ) - circular: Optional[bool] = Field(None, description="A flag indicating if the derivative molecule is circular (true) or linear (false).") + circular: Optional[bool] = Field(None, description="A flag indicating if the derivative molecule is circular (true) or linear (false).") class ga4gh(_Ga4ghIdentifiableObject.ga4gh): prefix = "DM" diff --git a/src/ga4gh/vrs/utils/hgvs_tools.py b/src/ga4gh/vrs/utils/hgvs_tools.py index 85cee520..4857af88 100644 --- a/src/ga4gh/vrs/utils/hgvs_tools.py +++ b/src/ga4gh/vrs/utils/hgvs_tools.py @@ -33,7 +33,7 @@ def __init__(self,data_proxy: _DataProxy = None): self.variant_mapper = hgvs.variantmapper.VariantMapper(self.uta_conn) self.data_proxy = data_proxy - + def close(self): # TODO These should only be closed if they are owned by this instance @@ -57,7 +57,7 @@ def parse(self, hgvs_str): if not self.hgvs_re.match(hgvs_str): return None return self.parser.parse_hgvs_variant(hgvs_str) - + def is_intronic(self, sv: hgvs.sequencevariant.SequenceVariant): """ Checks if the given SequenceVariant is intronic. @@ -71,13 +71,13 @@ def is_intronic(self, sv: hgvs.sequencevariant.SequenceVariant): if isinstance(sv.posedit.pos, hgvs.location.BaseOffsetInterval): return (sv.posedit.pos.start.is_intronic or sv.posedit.pos.end.is_intronic) return False - + def get_edit_type(self, sv: hgvs.sequencevariant.SequenceVariant): if sv is None or sv.posedit is None or sv.posedit.edit is None: return None return sv.posedit.edit.type - + def get_position_and_state(self, sv: hgvs.sequencevariant.SequenceVariant): """ Get the details of a sequence variant. @@ -150,14 +150,14 @@ def extract_allele_values(self, hgvs_expr: str): sv = self.parse(hgvs_expr) if not sv: return None - + if self.is_intronic(sv): raise ValueError("Intronic HGVS variants are not supported") refget_accession = self.data_proxy.derive_refget_accession(sv.ac) if not refget_accession: return None - + # translate coding coordinates to positional coordinates, if necessary if sv.type == "c": sv = self.c_to_n(sv) @@ -166,7 +166,7 @@ def extract_allele_values(self, hgvs_expr: str): retval = {"refget_accession": refget_accession, "start": start, "end": end, "literal_sequence": state} return retval - + def from_allele(self, vo, namespace=None): """generates a *list* of HGVS expressions for VRS Allele. @@ -188,15 +188,15 @@ def from_allele(self, vo, namespace=None): if vo is None: return [] - if not isinstance(vo, models.Allele): + if not isinstance(vo, models.Allele): raise ValueError("VRS object must be an Allele") if vo.location is None: raise ValueError("VRS allele must have a location") - + refget_accession = vo.location.get_refget_accession() if refget_accession is None: raise ValueError("VRS allele location must have a sequence reference") - + sequence = f"ga4gh:{refget_accession}" aliases = self.data_proxy.translate_sequence_identifier(sequence, namespace) @@ -218,9 +218,9 @@ def from_allele(self, vo, namespace=None): # create the hgvs expression object var = self._to_sequence_variant(vo, sequence_type, sequence, accession) hgvs_exprs += [str(var)] - + return list(set(hgvs_exprs)) - + def _to_sequence_variant(self, vo, sequence_type, sequence, accession): """Creates a SequenceVariant object from an Allele object.""" # build interval and edit depending on sequence type @@ -253,24 +253,24 @@ def _to_sequence_variant(self, vo, sequence_type, sequence, accession): # this will subsequently be converted back to `c.` after hgvs normalization type='n' if sequence_type == 'c' else sequence_type, posedit=posedit) - + try: # if the namespace is GRC, can't normalize, since hgvs can't deal with it parsed = self.parse(str(var)) var = self.normalize(parsed) - + # if sequence_type is coding, convert from "n." to "c." before continuing if sequence_type == "c": var = self.n_to_c(var) - + except hgvs.exceptions.HGVSDataNotAvailableError: _logger.warning(f"No data found for accession {accession}") - + return var def normalize(self, hgvs): return self.normalizer.normalize(hgvs) - + def n_to_c(self, hgvs): return self.variant_mapper.n_to_c(hgvs) @@ -312,4 +312,3 @@ def c_to_n(self, hgvs): hgvs_expr = hgvsTools.from_allele(vrs_allele, namespace="refseq") print(hgvs_expr) - diff --git a/submodules/README.md b/submodules/README.md index c09dc6f3..b8cc9428 100644 --- a/submodules/README.md +++ b/submodules/README.md @@ -1,4 +1,3 @@ This directory contains submodules required by vrs-python. If you don't see a vrs directory here, please reread instructions at https://github.com/ga4gh/vrs-python#installing-for-development. - diff --git a/tests/extras/data/test_vcf_input.vcf b/tests/extras/data/test_vcf_input.vcf index 5f972885..a6444fdc 100644 --- a/tests/extras/data/test_vcf_input.vcf +++ b/tests/extras/data/test_vcf_input.vcf @@ -235,4 +235,4 @@ chr19 28946400 . T C 50 PASS platforms=5;platformnames=Illumina,PacBio,CG,10X,So chr19 490414 . ACT A 50 PASS platforms=5;platformnames=Illumina,PacBio,CG,10X,Solid;datasets=5;datasetnames=HiSeqPE300x,CCS15kb_20kb,CGnormal,10XChromiumLR,SolidSE75bp;callsets=7;callsetnames=HiSeqPE300xGATK,CCS15kb_20kbDV,CCS15kb_20kbGATK4,CGnormal,HiSeqPE300xfreebayes,10XLRGATK,SolidSE75GATKHC;datasetsmissingcall=IonExome;callable=CS_HiSeqPE300xGATK_callable,CS_CCS15kb_20kbDV_callable,CS_CCS15kb_20kbGATK4_callable,CS_CGnormal_callable,CS_HiSeqPE300xfreebayes_callable;filt=CS_10XLRGATK_filt GT:PS:DP:ADALL:AD:GQ 0/1:.:821:163,158:239,220:1004 chr19 54220024 . G *,A 50 PASS platforms=1;platformnames=PacBio;datasets=1;datasetnames=CCS15kb_20kb;callsets=1;callsetnames=CCS15kb_20kbGATK4;datasetsmissingcall=HiSeqPE300x,CCS15kb_20kb,10XChromiumLR,CGnormal,IonExome,SolidSE75bp;callable=CS_CCS15kb_20kbGATK4_callable;filt=CS_CCS15kb_20kbDV_filt,CS_10XLRGATK_filt,CS_HiSeqPE300xfreebayes_filt;difficultregion=HG001.hg38.300x.bam.bilkentuniv.010920.dups,hg38.segdups_sorted_merged GT:PS:DP:ADALL:AD:GQ 1/2:.:45:0,20,25:0,20,25:99 chr19 54220999 . A T 50 PASS platforms=1;platformnames=PacBio;datasets=1;datasetnames=CCS15kb_20kb;callsets=1;callsetnames=CCS15kb_20kbGATK4;datasetsmissingcall=HiSeqPE300x,CCS15kb_20kb,10XChromiumLR,CGnormal,IonExome,SolidSE75bp;callable=CS_CCS15kb_20kbGATK4_callable;filt=CS_CCS15kb_20kbDV_filt,CS_10XLRGATK_filt,CS_HiSeqPE300xfreebayes_filt;difficultregion=HG001.hg38.300x.bam.bilkentuniv.010920.dups,hg38.segdups_sorted_merged GT:PS:DP:ADALL:AD:GQ 0/1:.:45:0,20,25:0,20,25:99 -chr19 54221654 . T A,P 50 PASS platforms=1;platformnames=PacBio;datasets=1;datasetnames=CCS15kb_20kb;callsets=1;callsetnames=CCS15kb_20kbGATK4;datasetsmissingcall=HiSeqPE300x,CCS15kb_20kb,10XChromiumLR,CGnormal,IonExome,SolidSE75bp;callable=CS_CCS15kb_20kbGATK4_callable;filt=CS_CCS15kb_20kbDV_filt,CS_10XLRGATK_filt,CS_HiSeqPE300xfreebayes_filt;difficultregion=HG001.hg38.300x.bam.bilkentuniv.010920.dups,hg38.segdups_sorted_merged GT:PS:DP:ADALL:AD:GQ 0/1:.:45:0,20,25:0,20,25:99 \ No newline at end of file +chr19 54221654 . T A,P 50 PASS platforms=1;platformnames=PacBio;datasets=1;datasetnames=CCS15kb_20kb;callsets=1;callsetnames=CCS15kb_20kbGATK4;datasetsmissingcall=HiSeqPE300x,CCS15kb_20kb,10XChromiumLR,CGnormal,IonExome,SolidSE75bp;callable=CS_CCS15kb_20kbGATK4_callable;filt=CS_CCS15kb_20kbDV_filt,CS_10XLRGATK_filt,CS_HiSeqPE300xfreebayes_filt;difficultregion=HG001.hg38.300x.bam.bilkentuniv.010920.dups,hg38.segdups_sorted_merged GT:PS:DP:ADALL:AD:GQ 0/1:.:45:0,20,25:0,20,25:99