v0.5.0 (#78)

* draft streaming with generators * set up effect types * profiling improvements * fix output * check for duplicates * add liftover * update dependencies and set up pre-commit * complain when linting fails * fix linting * support wide files * add log * fix tests and liftover * fix test * sqlite support and add log data * fix tests * fix tests * fixes to make old and new output consistent * update tests * drop parallel gzip and --threads * create ScoreVariant and EffectType classes * review comments * add type hints * remove coordinates from mandatory fields * fix old scoring files * check effect alleles and complex scoring files * don't access __annotations__ directly * remove logger * warn about complex files and variant mismatch * refactor scorevariant from userdict to class with __slots__ * fix __repr__ and type hints * add pyarrow support * add license data to log * add custom exceptions * add custom exit code * move class definitions * rename * update effect allele class * tidy up docstring * add docstrings to pytest * fix pyproject * Make sure that IID isn't converted to numeric during aggregation Signed-off-by: smlmbrt <sam.a.lambert@gmail.com> * bump minor version * dynamically set is_snp * remove samplesheet package * delete samplesheet tests * fix liftover * set up local venv * fix liftover test * improve comment --------- Signed-off-by: smlmbrt <sam.a.lambert@gmail.com> Co-authored-by: smlmbrt <sam.a.lambert@gmail.com>
PGScatalog · Feb 19, 2024 · c672be7 · c672be7
1 parent 6da7eb0
commit c672be7
Show file tree

Hide file tree

Showing 32 changed files with 4,200 additions and 1,182 deletions.
diff --git a/.gitignore b/.gitignore
@@ -157,4 +157,5 @@ cython_debug/
 #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
 #  and can be added to the global gitignore or merged into this file.  For a more nuclear
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
-.idea/
+.idea/
+.DS_Store
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -0,0 +1,8 @@
+repos:
+- repo: https://github.com/astral-sh/ruff-pre-commit
+  # Ruff version.
+  rev: v0.1.3
+  hooks:
+    - id: ruff
+      args: [--fix, --exit-non-zero-on-fix]  
+    - id: ruff-format
diff --git a/conftest.py b/conftest.py
@@ -1,33 +1,63 @@
 import glob
 import importlib.resources
 import os
-import pathlib
 import shutil
 from unittest.mock import patch
 
-import pandas as pd
 import polars as pl
 import pytest
 import requests as req
 
 from pgscatalog_utils.download.download_scorefile import download_scorefile
 from pgscatalog_utils.match.preprocess import complement_valid_alleles
 from pgscatalog_utils.scorefile.combine_scorefiles import combine_scorefiles
+from pgscatalog_utils.scorefile.scorevariant import ScoreVariant
+
+from tests.data import combine
 
 pl.toggle_string_cache(True)
 
 
 @pytest.fixture(scope="session")
 def pgs_accessions():
-    return ['PGS001229', 'PGS000922']
+    return ["PGS001229", "PGS000922"]
+
+
+@pytest.fixture(scope="session")
+def mini_score_path(tmp_path_factory):
+    path = importlib.resources.files(combine) / "PGS001229_22.txt"
+    return str(path)
+
+
+@pytest.fixture(scope="session")
+def mini_scorefile(mini_score_path, tmp_path_factory):
+    # The mini scorefile overlaps well with cineca synthetic subset
+    out_path = tmp_path_factory.mktemp("scores") / "mini_score.txt"
+    args: list[str] = (
+        ["combine_scorefiles", "-t", "GRCh37", "-s"]
+        + [mini_score_path]
+        + ["-o", str(out_path.resolve())]
+    )
+
+    with patch("sys.argv", args):
+        combine_scorefiles()
+
+    return str(out_path.resolve())
 
 
 @pytest.fixture(scope="session")
 def scorefiles(tmp_path_factory, pgs_accessions):
     fn = tmp_path_factory.mktemp("scorefiles")
-    args: list[str] = ['download_scorefiles', '-b', 'GRCh37', '-o', str(fn.resolve()), '-i'] + pgs_accessions
-
-    with patch('sys.argv', args):
+    args: list[str] = [
+        "download_scorefiles",
+        "-b",
+        "GRCh37",
+        "-o",
+        str(fn.resolve()),
+        "-i",
+    ] + pgs_accessions
+
+    with patch("sys.argv", args):
         download_scorefile()
 
     return glob.glob(os.path.join(fn.resolve(), "*.txt.gz"))
@@ -37,138 +67,135 @@ def scorefiles(tmp_path_factory, pgs_accessions):
 def target_path(tmp_path_factory):
     try:
         bim = req.get(
-            'https://gitlab.ebi.ac.uk/nebfield/test-datasets/-/raw/master/pgsc_calc/cineca_synthetic_subset.bim',
-            timeout=5)
+            "https://gitlab.ebi.ac.uk/nebfield/test-datasets/-/raw/master/pgsc_calc/cineca_synthetic_subset.bim",
+            timeout=5,
+        )
     except (req.exceptions.ConnectionError, req.Timeout):
         bim = []
 
     if not bim:
         pytest.skip("Couldn't get test data from network")
     else:
         fn = tmp_path_factory.mktemp("target") / "data.bim"
-        with open(fn, 'wb') as f:
+        with open(fn, "wb") as f:
             f.write(bim.content)
 
         return str(fn.resolve())
 
 
-@pytest.fixture(scope="session")
-def mini_score_path(tmp_path_factory):
-    try:
-        score = req.get('https://gitlab.ebi.ac.uk/nebfield/test-datasets/-/raw/master/pgsc_calc/PGS001229_22.txt',
-                        timeout=5)
-    except (req.exceptions.ConnectionError, req.Timeout):
-        score = []
-
-    if not score:
-        pytest.skip("Couldn't get test data from network")
-    else:
-        fn = tmp_path_factory.mktemp("score") / "PGS001229_22.txt"
-        with open(fn, 'wb') as f:
-            f.write(score.content)
-
-        return str(fn.resolve())
-
-
-@pytest.fixture(scope="session")
-def mini_scorefile(mini_score_path, tmp_path_factory):
-    # The mini scorefile overlaps well with cineca synthetic subset
-    out_path = tmp_path_factory.mktemp("scores") / "mini_score.txt"
-    args: list[str] = ['combine_scorefiles', '-t', 'GRCh37', '-s'] + [mini_score_path] + ['-o', str(out_path.resolve())]
-
-    with patch('sys.argv', args):
-        combine_scorefiles()
-
-    return str(out_path.resolve())
-
-
-@pytest.fixture(scope="session")
-def combined_scorefile(scorefiles, tmp_path_factory):
-    # The combined scorefile overlaps poorly with cineca synthetic subset
-    out_path = tmp_path_factory.mktemp("scores") / "combined.txt"
-    args: list[str] = ['combine_scorefiles', '-t', 'GRCh37', '-s'] + scorefiles + ['-o', str(out_path.resolve())]
-
-    with patch('sys.argv', args):
-        combine_scorefiles()
-
-    return str(out_path.resolve())
-
-
 @pytest.fixture(scope="session")
 def chain_files(tmp_path_factory):
-    chain_dir = tmp_path_factory.mktemp('chain_dir')
+    chain_dir = tmp_path_factory.mktemp("chain_dir")
 
     shutil.copy2("tests/data/hg19ToHg38.over.chain.gz", chain_dir)
     shutil.copy2("tests/data/hg38ToHg19.over.chain.gz", chain_dir)
-
-    return str(chain_dir.resolve())
-
-
-@pytest.fixture(scope="session")
-def lifted_scorefiles(mini_score_path, chain_files, tmp_path_factory):
-    out_path = tmp_path_factory.mktemp("scores") / "lifted.txt"
-    args: list[str] = ['combine_scorefiles', '-s'] + [mini_score_path] + ['--liftover', '-c', chain_files, '-t',
-                                                                          'GRCh38',
-                                                                          '-m', '0.8'] + ['-o', str(out_path.resolve())]
-
-    with patch('sys.argv', args):
-        combine_scorefiles()
 
-    return str(out_path.resolve())
+    return str(chain_dir.resolve())
 
 
 @pytest.fixture(scope="session")
 def hg38_coords():
-    d = {'rsid': ['rs11903757', 'rs6061231'], 'chr_name': ['2', '20'], 'chr_position': [191722478, 62381861]}
-    df = pd.DataFrame(d)
-    df['accession'] = 'dummy'
-    df['genome_build'] = 'GRCh38'
-    return df
+    rs11903757 = ScoreVariant(
+        **{
+            "rsid": "rs11903757",
+            "chr_name": "2",
+            "chr_position": 191722478,
+            "row_nr": 0,
+            "effect_weight": 1,
+            "accession": "test",
+            "effect_allele": "A",
+        }
+    )
+    rs6061231 = ScoreVariant(
+        **{
+            "rsid": "rs6061231",
+            "chr_name": "20",
+            "chr_position": 62381861,
+            "row_nr": 1,
+            "effect_weight": 1,
+            "accession": "test",
+            "effect_allele": "A",
+        }
+    )
+    return (x for x in [rs11903757, rs6061231])
 
 
 @pytest.fixture(scope="session")
-def hg19_coords(hg38_coords):
+def hg19_coords():
     # hg38_coords in GRCh37, from dbSNP
-    d = {'lifted_chr': ['2', '20'], 'lifted_pos': [192587204, 60956917], 'liftover': [True, True]}
-    return pd.DataFrame(d)
+    rs11903757 = ScoreVariant(
+        **{
+            "rsid": "rs11903757",
+            "chr_name": "2",
+            "chr_position": 192587204,
+            "row_nr": 0,
+            "effect_weight": 1,
+            "accession": "test",
+            "effect_allele": "A",
+        }
+    )
+    rs6061231 = ScoreVariant(
+        **{
+            "rsid": "rs6061231",
+            "chr_name": "20",
+            "chr_position": 60956917,
+            "row_nr": 1,
+            "effect_weight": 1,
+            "accession": "test",
+            "effect_allele": "A",
+        }
+    )
+    return (x for x in [rs11903757, rs6061231])
 
 
-@pytest.fixture(scope='session')
+@pytest.fixture(scope="session")
 def small_flipped_scorefile(small_scorefile):
     # simulate a scorefile on the wrong strand
-    return (complement_valid_alleles(small_scorefile, ['effect_allele', 'other_allele'])
-            .drop(['effect_allele', 'other_allele'])
-            .rename({'effect_allele_FLIP': 'effect_allele', 'other_allele_FLIP': 'other_allele'})
-            .pipe(complement_valid_alleles, ['effect_allele', 'other_allele']))
+    return (
+        complement_valid_alleles(small_scorefile, ["effect_allele", "other_allele"])
+        .drop(["effect_allele", "other_allele"])
+        .rename(
+            {"effect_allele_FLIP": "effect_allele", "other_allele_FLIP": "other_allele"}
+        )
+        .pipe(complement_valid_alleles, ["effect_allele", "other_allele"])
+    )
 
 
-@pytest.fixture(scope='session')
+@pytest.fixture(scope="session")
 def small_target():
-    return pl.DataFrame({"#CHROM": [1, 2, 3],
-                         "POS": [1, 2, 3],
-                         "REF": ["A", "T", "T"],
-                         "ALT": ["C", "A", "G"],
-                         "ID": ["1:1:A:C", "2:2:T:A", "3:3:T:G"],
-                         "is_multiallelic": [False, False, False]})
+    return pl.DataFrame(
+        {
+            "#CHROM": [1, 2, 3],
+            "POS": [1, 2, 3],
+            "REF": ["A", "T", "T"],
+            "ALT": ["C", "A", "G"],
+            "ID": ["1:1:A:C", "2:2:T:A", "3:3:T:G"],
+            "is_multiallelic": [False, False, False],
+        }
+    )
 
 
-@pytest.fixture(scope='session')
+@pytest.fixture(scope="session")
 def small_scorefile():
-    df = pl.DataFrame({"accession": ["test", "test", "test"],
-                       "row_nr": [1, 2, 3],
-                       "chr_name": [1, 2, 3],
-                       "chr_position": [1, 2, 3],
-                       "effect_allele": ["A", "A", "G"],
-                       "other_allele": ["C", "T", "T"],
-                       "effect_weight": [1, 2, 3],
-                       "effect_type": ["additive", "additive", "additive"]})
+    df = pl.DataFrame(
+        {
+            "accession": ["test", "test", "test"],
+            "row_nr": [1, 2, 3],
+            "chr_name": [1, 2, 3],
+            "chr_position": [1, 2, 3],
+            "effect_allele": ["A", "A", "G"],
+            "other_allele": ["C", "T", "T"],
+            "effect_weight": [1, 2, 3],
+            "effect_type": ["additive", "additive", "additive"],
+        }
+    )
 
     return complement_valid_alleles(df, ["effect_allele", "other_allele"])
 
 
-@pytest.fixture(scope='session')
+@pytest.fixture(scope="session")
 def small_scorefile_no_oa(small_scorefile):
-    return small_scorefile.with_column(pl.lit(None).alias('other_allele'))
+    return small_scorefile.with_column(pl.lit(None).alias("other_allele"))
 
 
 def _get_timeout(url):

diff --git a/pgscatalog_utils/aggregate/aggregate_scores.py b/pgscatalog_utils/aggregate/aggregate_scores.py
@@ -33,7 +33,7 @@ def aggregate(scorefiles: list[str]):
     for i, path in enumerate(scorefiles):
         logger.debug(f"Reading {path}")
         # pandas can automatically detect zst compression, neat!
-        df = (pd.read_table(path)
+        df = (pd.read_table(path, converters={"#IID": str}, header=0)
               .assign(sampleset=path.split('_')[0])
               .set_index(['sampleset', '#IID']))
 

diff --git a/pgscatalog_utils/download/GenomeBuild.py b/pgscatalog_utils/download/GenomeBuild.py
@@ -1,6 +1,25 @@
-from enum import Enum, auto
+from enum import Enum
 
 
 class GenomeBuild(Enum):
-    GRCh37 = auto()
-    GRCh38 = auto()
+    GRCh37 = "GRCh37"
+    GRCh38 = "GRCh38"
+    # just included to handle older files, incompatible unless harmonised:
+    NCBI36 = "NCBI36"  # ew
+
+    def __str__(self):
+        return str(self.value)
+
+    @classmethod
+    def from_string(cls, build):
+        match build:
+            case "GRCh37" | "hg19":
+                return cls(GenomeBuild.GRCh37)
+            case "GRCh38" | "hg38":
+                return cls(GenomeBuild.GRCh38)
+            case "NR":
+                return None
+            case "NCBI36" | "hg18":
+                return cls(GenomeBuild.NCBI36)
+            case _:
+                raise Exception(f"Can't match {build=}")