diff --git a/CHANGELOG.md b/CHANGELOG.md index f2108ca4..55e2f5de 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,21 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ## [Unreleased] +## [0.11.1] - 2022-01-06 + +### Added +- `quiet` flag for `genomepy.Annotation` +- `genomepy -v` flag + +### Changed +- `genomepy.Annotation` returns a `FileNotFoundError` instead of a `ValueError` where appropriate. +- `download_assembly_report` refactored. Now downloads the report for the exact same assembly accession (and not the nearest NCBI assembly). +- broader unit tests for UCSC assembly accession scraping + + +### Fixed +- inconsistent behaviour with assembly reports (#193 + #194) + ## [0.11.0] - 2021-11-18 ### Added @@ -368,6 +383,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added `-r` and `--match/--no-match` option to select sequences by regex. [Unreleased]: https://github.com/vanheeringen-lab/genomepy/compare/master...develop +[0.11.1]: https://github.com/vanheeringen-lab/genomepy/compare/0.11.0...0.11.1 [0.11.0]: https://github.com/vanheeringen-lab/genomepy/compare/0.10.0...0.11.0 [0.10.0]: https://github.com/vanheeringen-lab/genomepy/compare/0.9.3...0.10.0 [0.9.3]: https://github.com/vanheeringen-lab/genomepy/compare/0.9.2...0.9.3 diff --git a/docs/release_checklist.md b/docs/release_checklist.md index 9b200d79..83b26bbc 100644 --- a/docs/release_checklist.md +++ b/docs/release_checklist.md @@ -4,9 +4,9 @@ 1. Make sure all tests pass. - ```shell - pytest -vv --disable-pytest-warnings - ``` + ```shell + pytest -vv --disable-pytest-warnings + ``` 2. Create release candidate with `git flow`: @@ -58,13 +58,13 @@ ```shell git flow release finish ${new_version} - ``` + ``` 7. Push everything to github, including tags: - ```shell - git push --follow-tags origin develop - ``` + ```shell + git push --follow-tags origin develop + ``` 8. Pull into master diff --git a/genomepy/__about__.py b/genomepy/__about__.py index 5238339a..df01b7fc 100644 --- a/genomepy/__about__.py +++ b/genomepy/__about__.py @@ -1,3 +1,3 @@ """Metadata""" -__version__ = "0.11.0" +__version__ = "0.11.1" __author__ = "Simon van Heeringen, Siebren Frölich, Maarten van der Sande" diff --git a/genomepy/annotation/__init__.py b/genomepy/annotation/__init__.py index db26603a..54e3e1ee 100644 --- a/genomepy/annotation/__init__.py +++ b/genomepy/annotation/__init__.py @@ -28,6 +28,8 @@ class Annotation: Genome name/directory/fasta or gene annotation BED/GTF file. genomes_dir : str, optional Genomes installation directory. + quiet : bool, optional + Silence init warnings Returns ------- @@ -52,7 +54,7 @@ class Annotation: annotation_contigs: list = None "Contigs found in the gene annotation BED" - def __init__(self, name: str, genomes_dir: str = None): + def __init__(self, name: str, genomes_dir: str = None, quiet: bool = False): # name and directory n, g = _get_name_and_dir(name, genomes_dir) self.name = n @@ -65,12 +67,12 @@ def __init__(self, name: str, genomes_dir: str = None): suffixes = Path(fname).suffixes[-2:] b = fname if not (".bed" in suffixes or ".BED" in suffixes): - b = _get_file(self.genome_dir, f"{self.name}.annotation.bed") + b = _get_file(self.genome_dir, f"{self.name}.annotation.bed", not quiet) self.annotation_bed_file = b "path to the gene annotation BED file" g = fname if not (".gtf" in suffixes or ".GTF" in suffixes): - g = _get_file(self.genome_dir, f"{self.name}.annotation.gtf") + g = _get_file(self.genome_dir, f"{self.name}.annotation.gtf", not quiet) self.annotation_gtf_file = g "path to the gene annotation GTF file" @@ -385,7 +387,7 @@ def _get_name_and_dir(name, genomes_dir=None): elif name in os.listdir(genomes_dir): genome_dir = os.path.join(genomes_dir, name) else: - raise ValueError(f"Could not find {name}") + raise FileNotFoundError(f"Could not find {name}") return name, genome_dir diff --git a/genomepy/annotation/mygene.py b/genomepy/annotation/mygene.py index ac395a7e..4e1c9c8e 100644 --- a/genomepy/annotation/mygene.py +++ b/genomepy/annotation/mygene.py @@ -161,55 +161,6 @@ def _parse_mygene_input(field, product=None): return field, product -# def ensembl_genome_info(self) -> Optional[Tuple[str, str, str]]: -# """ -# Return Ensembl genome information for this genome. -# -# Requires accession numbers to match (excluding patch numbers) -# -# Returns -# ------- -# (str, str, str) -# Ensembl name, accession, taxonomy_id -# """ -# _check_property(self.readme_file, "README.txt") -# -# metadata, _ = read_readme(self.readme_file) -# if metadata.get("provider") == "Ensembl": -# return metadata["name"], metadata["assembly_accession"], metadata["tax_id"] -# -# if metadata.get("assembly_accession", "na") == "na": -# logger.warning("Cannot find a matching genome without an assembly accession.") -# return -# -# asm_acc = metadata["assembly_accession"] -# search_result = nearest_assembly(asm_acc, "Ensembl") -# if search_result is None: -# return -# -# return safe(search_result[0]), search_result[2], search_result[3] -# -# -# def _query_mygene( -# self, -# query: Iterable[str], -# field: str = "genomic_pos", -# ) -> pd.DataFrame: -# # mygene.info only queries the most recent version of the Ensembl database -# # We can only safely continue if the local genome matched the Ensembl genome. -# # Even if the local genome was installed via Ensembl, we still need to check -# # if it is the same version -# ensembl_info = ensembl_genome_info(self) -# if ensembl_info is None: -# return pd.DataFrame() -# -# tax_id = ensembl_info[2] -# if not str(tax_id).isdigit(): -# raise ValueError("No taxomoy ID found") -# -# return query_mygene(query, tax_id, field) - - def _filter_query(query: pd.DataFrame) -> pd.DataFrame: """per queried gene, keep the best matching, non-NaN, mapping""" if "notfound" in query: diff --git a/genomepy/cli.py b/genomepy/cli.py index 21d265c5..74be5b37 100755 --- a/genomepy/cli.py +++ b/genomepy/cli.py @@ -16,7 +16,7 @@ @click.group(context_settings=CONTEXT_SETTINGS) -@click.version_option(genomepy.__version__) +@click.version_option(genomepy.__version__, "-v", "--version") def cli(): pass # noqa diff --git a/genomepy/providers/__init__.py b/genomepy/providers/__init__.py index d6e786e7..b5027be8 100644 --- a/genomepy/providers/__init__.py +++ b/genomepy/providers/__init__.py @@ -1,16 +1,15 @@ """Provider class, modules & related functions""" import os -from typing import List, Optional +from typing import Optional import pandas as pd from loguru import logger from genomepy.files import read_readme -from genomepy.providers.base import ASM_FORMAT from genomepy.providers.ensembl import EnsemblProvider from genomepy.providers.gencode import GencodeProvider from genomepy.providers.local import LocalProvider -from genomepy.providers.ncbi import NcbiProvider +from genomepy.providers.ncbi import NcbiProvider, download_assembly_report from genomepy.providers.ucsc import UcscProvider from genomepy.providers.url import UrlProvider from genomepy.utils import get_genomes_dir, safe @@ -22,7 +21,6 @@ "search", "map_locations", "download_assembly_report", - "nearest_assembly", "online_providers", ] @@ -134,106 +132,6 @@ def search(term, provider: str = None): yield ret -def _closest_patch_lvl(reference, targets): - ref_patch = int(reference.split(".")[1]) if "." in reference else 0 - nearest = [999, []] - for target in targets: - tgt_patch = int(target.split(".")[1]) if "." in target else 0 - distance = abs(tgt_patch + 0.1 - ref_patch) # tiebreaker: newer > older patches - if distance == nearest[0]: - nearest[1].append(target) - if distance < nearest[0]: - nearest = [distance, [target]] - return nearest[1] - - -def _best_accession(reference: str, targets: list): - """Return the nearest accession ID from a list of IDs""" - if len(targets) == 1: - return targets[0] - - # GCA/GCF - matching_prefix = [t for t in targets if t.startswith(reference[0:3])] - if len(matching_prefix) > 0: - targets = matching_prefix - - # patch levels - # e.g. GCA_000002035.4 & GCA_000002035.3 - targets = _closest_patch_lvl(reference, targets) - - if len(targets) > 1: - logger.info( - f"Multiple matching accession numbers found. Selecting the closest ({targets[0]})." - ) - return targets[0] - - -def _best_search_result(asm_acc: str, results: List[list]) -> list: - """Return the best search result based on accession IDs.""" - results = [res for res in results if res[2] is not None] - - if len(results) > 1: - accessions = [res[2] for res in results] - bes_acc = _best_accession(asm_acc, accessions) - results = [res for res in results if res[2] == bes_acc] - - if len(results) > 0: - return results[0] - - -def nearest_assembly(asm_acc: str, provider: str) -> list: - """ - Return the search result of the assembly nearest to - the given accession ID, in the specified provider. - """ - if not asm_acc.startswith(("GCA_", "GCF_")): - raise ValueError("asm_acc must be an assembly accession ID.") - - search_results = list(search(asm_acc, provider=provider)) - best_result = _best_search_result(asm_acc, search_results) - if best_result is None: - logger.warning(f"No assembly similar to {asm_acc} on {provider}.") - return best_result - - -def download_assembly_report(asm_acc: str, fname: str = None): - """ - Retrieve the NCBI assembly report. - - Returns the assembly_report as a pandas DataFrame if fname is not specified. - - Parameters - ---------- - asm_acc : str - Assembly accession (GCA or GCF) - fname : str, optional - Save assembly_report to this filename. - - Returns - ------- - pandas.DataFrame - NCBI assembly report. - """ - search_result = nearest_assembly(asm_acc, "NCBI") - if search_result is None: - return - ncbi_acc = search_result[2] - ncbi_name = search_result[0] - - # NCBI FTP location of assembly report - assembly_report = ( - f"https://ftp.ncbi.nlm.nih.gov/genomes/all/{ncbi_acc[0:3]}/" - + f"{ncbi_acc[4:7]}/{ncbi_acc[7:10]}/{ncbi_acc[10:13]}/" - + f"{ncbi_acc}_{ncbi_name}/{ncbi_acc}_{ncbi_name}_assembly_report.txt" - ) - asm_report = pd.read_csv(assembly_report, sep="\t", comment="#", names=ASM_FORMAT) - - if fname: - asm_report.to_csv(fname, sep="\t", index=False) - else: - return asm_report - - def map_locations( frm: str, to: str, genomes_dir: Optional[str] = None ) -> Optional[pd.DataFrame]: @@ -279,7 +177,7 @@ def map_locations( logger.warning("Cannot map without an assembly report.") return - asm_report = pd.read_csv(frm_asm_report, sep="\t", comment="#") + asm_report = pd.read_csv(frm_asm_report, sep="\t", comment="#", dtype=str) asm_report["ensembl_name"] = asm_report["Sequence-Name"] asm_report["ncbi_name"] = asm_report["Sequence-Name"] asm_report["ucsc_name"] = asm_report["UCSC-style-name"] diff --git a/genomepy/providers/base.py b/genomepy/providers/base.py index a9d30c07..1c708b88 100644 --- a/genomepy/providers/base.py +++ b/genomepy/providers/base.py @@ -18,19 +18,6 @@ from genomepy.online import download_file from genomepy.utils import get_genomes_dir, get_localname, lower, mkdir_p, rm_rf, safe -ASM_FORMAT = [ - "Sequence-Name", - "Sequence-Role", - "Assigned-Molecule", - "Assigned-Molecule-Location/Type", - "GenBank-Accn", - "Relationship", - "RefSeq-Accn", - "Assembly-Unit", - "Sequence-Length", - "UCSC-style-name", -] - class BaseProvider: """ @@ -111,7 +98,7 @@ def genome_taxid(self, name: str) -> int: if tid.isdigit(): return int(tid) - def assembly_accession(self, name: str) -> str: + def assembly_accession(self, name: str) -> str or None: """ Return the assembly accession number (GCA* or GCF*) for a genome. @@ -523,7 +510,11 @@ def rename_contigs(annot_file): genome_dir = os.path.dirname(os.path.dirname(annot_file)) asm_report = os.path.join(genome_dir, "assembly_report.txt") gencode2ucsc = pd.read_csv( - asm_report, sep="\t", comment="#", usecols=["GenBank-Accn", "UCSC-style-name"] + asm_report, + sep="\t", + comment="#", + usecols=["GenBank-Accn", "UCSC-style-name"], + dtype=str, ) gtf = read_annot(annot_file) diff --git a/genomepy/providers/gencode.py b/genomepy/providers/gencode.py index b8668fde..abd7b42b 100644 --- a/genomepy/providers/gencode.py +++ b/genomepy/providers/gencode.py @@ -1,16 +1,16 @@ import os from time import sleep -import pandas as pd from loguru import logger from genomepy.caching import cache from genomepy.exceptions import GenomeDownloadError from genomepy.files import update_readme from genomepy.online import check_url, connect_ftp_link -from genomepy.providers.base import ASM_FORMAT, BaseProvider, download_annotation +from genomepy.providers.base import BaseProvider, download_annotation +from genomepy.providers.ncbi import download_assembly_report from genomepy.providers.ucsc import UcscProvider -from genomepy.utils import get_genomes_dir, get_localname, mkdir_p +from genomepy.utils import get_genomes_dir, get_localname class GencodeProvider(BaseProvider): @@ -147,7 +147,7 @@ def download_annotation(self, name, genomes_dir=None, localname=None, **kwargs): # download exact assembly report to rename the scaffolds acc = self.assembly_accession(name) fname = os.path.join(genomes_dir, localname, "assembly_report.txt") - exact_assembly_report(name, acc, fname) + download_assembly_report(acc, fname) download_annotation(genomes_dir, link, localname) logger.info("Annotation download successful") @@ -286,14 +286,3 @@ def get_genomes(ftp_link): ftp.quit() return genomes - - -def exact_assembly_report(name, acc, fname): - assembly_report = ( - f"https://ftp.ncbi.nlm.nih.gov/genomes/all/{acc[0:3]}/" - + f"{acc[4:7]}/{acc[7:10]}/{acc[10:13]}/" - + f"{acc}_{name}/{acc}_{name}_assembly_report.txt" - ) - asm_report = pd.read_csv(assembly_report, sep="\t", comment="#", names=ASM_FORMAT) - mkdir_p(os.path.dirname(fname)) - asm_report.to_csv(fname, sep="\t", index=False) diff --git a/genomepy/providers/ncbi.py b/genomepy/providers/ncbi.py index 99e7604c..4e1ce543 100644 --- a/genomepy/providers/ncbi.py +++ b/genomepy/providers/ncbi.py @@ -1,5 +1,6 @@ import os import re +import urllib.error from typing import List from urllib.request import urlopen @@ -9,9 +10,22 @@ from genomepy.caching import cache from genomepy.exceptions import GenomeDownloadError -from genomepy.online import check_url +from genomepy.online import check_url, read_url from genomepy.providers.base import BaseProvider -from genomepy.utils import safe +from genomepy.utils import mkdir_p, safe + +ASM_FORMAT = [ + "Sequence-Name", + "Sequence-Role", + "Assigned-Molecule", + "Assigned-Molecule-Location/Type", + "GenBank-Accn", + "Relationship", + "RefSeq-Accn", + "Assembly-Unit", + "Sequence-Length", + "UCSC-style-name", +] class NcbiProvider(BaseProvider): @@ -104,13 +118,12 @@ def _post_process_download(self, name, fname, out_dir, mask="soft"): """ # (down)load the assembly report asm_fname = os.path.join(out_dir, "assembly_report.txt") - if os.path.exists(asm_fname): - asm_report = pd.read_csv(asm_fname, sep="\t", comment="#", header=None) - else: - url = self._ftp_or_html_link(name, "_assembly_report.txt", True) - asm_report = pd.read_csv(url, sep="\t", comment="#", header=None) + if not os.path.exists(asm_fname): # save assembly report - asm_report.to_csv(asm_fname, sep="\t", index=False, header=False) + download_assembly_report(self.assembly_accession(name), asm_fname) + asm_report = pd.read_csv( + asm_fname, sep="\t", comment="#", header=None, dtype=str + ) # create mapping of chromosome accessions to chromosome names tr = asm_report.set_index(6)[0].to_dict() @@ -216,3 +229,82 @@ def load_summary(url): name = safe(line[15]) # overwrites older asm_names genomes[name] = dict(zip(header, line)) return genomes + + +def download_assembly_report(acc: str, fname: str = None): + """ + Retrieve the NCBI assembly report. + + Returns the assembly_report as a pandas DataFrame if fname is not specified. + + Parameters + ---------- + acc : str + Assembly accession (GCA or GCF) + fname : str, optional + Save assembly_report to this filename. + + Returns + ------- + pandas.DataFrame + NCBI assembly report. + """ + msg = "Could not download the assembly report from NCBI. " + if not isinstance(acc, str) or not acc.startswith(("GCA", "GCF")): + logger.warning(msg) + return None + assembly_report = _assembly_report_url(acc) + if assembly_report is None: + logger.warning(msg + f"Assembly accession '{acc}' not found.") + return None + asm_report = pd.read_csv( + assembly_report, sep="\t", comment="#", names=ASM_FORMAT, dtype=str + ) + + if fname: + mkdir_p(os.path.dirname(fname)) + asm_report.to_csv(fname, sep="\t", index=False) + else: + return asm_report + + +def _assembly_report_url(acc: str) -> str or None: + ftp_basedir = "https://ftp.ncbi.nlm.nih.gov/genomes/all" + asm_dir = f"{ftp_basedir}/{acc[0:3]}/{acc[4:7]}/{acc[7:10]}/{acc[10:13]}/" + try: + text = read_url(asm_dir) + except urllib.error.HTTPError: + return None + + # try to find exact accession patch level + hits = re.findall(acc + '_.*/"', text) # len 0-1 + if len(hits) == 0: + # patch level not found, pick the closest instead + hits = re.findall(acc + '.*/"', text) + available_accessions = ["_".join(h.split("_")[0:2]) for h in hits] + closest = _closest_patch_lvl(acc, available_accessions) + hits = [h for h in hits if closest + "_" in h] # len 0-1 + if len(hits) == 0: + return None + + name = hits[0][:-2] + assembly_report = asm_dir + name + "/" + name + "_assembly_report.txt" + return assembly_report + + +def _closest_patch_lvl(reference: str, targets: list) -> str: + ref_patch = _patch_lvl(reference) + closest = "" + closest_dist = 999 + for target in targets: + tgt_patch = _patch_lvl(target) + dist = abs(tgt_patch - 0.1 - ref_patch) # tiebreaker: newer > older patches + if dist < closest_dist: + closest = target + closest_dist = dist + return closest + + +def _patch_lvl(acc: str) -> int: + """str(GCA_000000000.6) -> int(6)""" + return int(acc.split(".")[1]) if "." in acc else 0 diff --git a/genomepy/providers/ucsc.py b/genomepy/providers/ucsc.py index c4eadcf1..7b7717d0 100644 --- a/genomepy/providers/ucsc.py +++ b/genomepy/providers/ucsc.py @@ -123,7 +123,7 @@ def _search_accession_ncbi(self, term: str) -> Iterator[str]: yield name break # max one hit per genome - def assembly_accession(self, name: str) -> str: + def assembly_accession(self, name: str) -> str or None: """ Return the assembly accession (`GCA_`/`GCF_`) for a genome. @@ -149,8 +149,6 @@ def assembly_accession(self, name: str) -> str: if acc: return acc - return "na" - def annotation_links(self, name, **kwargs) -> List[str]: """ Return a sorted list of available gene annotation types for a genome @@ -578,7 +576,7 @@ def download_annotation(name, annot, genomes_dir, localname, n=None): @cache -def scrape_accession(htmlpath: str) -> str: +def scrape_accession(htmlpath: str) -> str or None: """ Attempt to scrape the assembly accession (`GCA_`/`GCF_`) from a genome's readme.html, or any linked NCBI assembly pages can also be scraped. @@ -597,7 +595,7 @@ def scrape_accession(htmlpath: str) -> str: try: text = read_url(ucsc_url) except (UnicodeDecodeError, urllib.error.URLError): - return "na" + return None # example accessions: GCA_000004335.1 (ailMel1) # regex: GC[AF]_ = GCA_ or GCF_, \d = digit, \. = period @@ -613,12 +611,13 @@ def scrape_accession(htmlpath: str) -> str: try: text = read_url(ncbi_url) except (UnicodeDecodeError, urllib.error.URLError): - return "na" + return None # retrieve valid assembly accessions. # contains additional info, such as '(latest)' or '(suppressed)'. Unused for now. valid_accessions = re.findall(r"assembly accession:.*?GC[AF]_.*?<", text) text = " ".join(valid_accessions) + # this selects GCA > GCF if both are found match = accession_regex.search(text) if match: return match.group(0) diff --git a/tests/test_09_provider_ncbi.py b/tests/test_07_provider_ncbi.py similarity index 84% rename from tests/test_09_provider_ncbi.py rename to tests/test_07_provider_ncbi.py index 6f6291e8..21426554 100644 --- a/tests/test_09_provider_ncbi.py +++ b/tests/test_07_provider_ncbi.py @@ -2,6 +2,10 @@ from shutil import copyfile from tempfile import TemporaryDirectory +import pandas as pd + +import genomepy + def test_ncbiprovider(ncbi): assert ncbi.name == "NCBI" @@ -64,3 +68,12 @@ def test__get_genomes(ncbi): assert field in genome assert genome["species_taxid"] == "2097" assert genome["taxid"] == "243273" + + +def test_download_assembly_report(): + assembly_report = "tests/data/sacCer3/assembly_report.txt" + genomepy.providers.download_assembly_report("GCA_000146045", assembly_report) + report = pd.read_csv(assembly_report, sep="\t", comment="#") + + assert isinstance(report, pd.DataFrame) + assert list(report.columns) == genomepy.providers.ncbi.ASM_FORMAT diff --git a/tests/test_08_provider_ucsc.py b/tests/test_08_provider_ucsc.py index 548a7a84..f7abca20 100644 --- a/tests/test_08_provider_ucsc.py +++ b/tests/test_08_provider_ucsc.py @@ -17,10 +17,24 @@ def test__search_accession(ucsc): assert next(ucsc._search_accession("GCA_000004335.1")) == "ailMel1" +def test__search_accession_ncbi(ucsc): + ret = list(ucsc._search_accession_ncbi("GCF_000004195.2")) + expected = ["xenTro1", "xenTro10", "xenTro2", "xenTro3", "xenTro7", "xenTro9"] + assert ret == expected + + def test_assembly_accession(ucsc): + # in UCSC(, not in NCBI) accession = ucsc.assembly_accession("sacCer3") + assert accession == "GCA_000146045.2" + + # not in UCSC, not in NCBI + accession = ucsc.assembly_accession("sacCer1") + assert accession is None - assert accession.startswith("GCA_000146045") + # not in UCSC, in NCBI + accession = ucsc.assembly_accession("xenTro7") + assert accession == "GCA_000004195.2" def test_annotation_links(ucsc): @@ -107,6 +121,10 @@ def test_get_annotation_download_link(ucsc): ucsc.get_annotation_download_link(genome, **{"ucsc_annotation_type": "what?"}) +def test_download_annotation(ucsc): + ucsc.download_annotation("sacCer1") + + def test_get_genomes(ucsc): assert isinstance(ucsc.genomes, dict) assert "ailMel1" in ucsc.genomes diff --git a/tests/test_07_provider_ensembl.py b/tests/test_09_provider_ensembl.py similarity index 100% rename from tests/test_07_provider_ensembl.py rename to tests/test_09_provider_ensembl.py diff --git a/tests/test_11_provider.py b/tests/test_11_provider.py index 2d937e73..43d75ac1 100644 --- a/tests/test_11_provider.py +++ b/tests/test_11_provider.py @@ -60,169 +60,6 @@ def test_search(): assert "8364" in str(metadata[3]) -def test__best_search_result(): - # ncbi_human = list(genomepy.search("GCA_000001405", provider="NCBI")) - ncbi_human = [ - [ - "GRCh38", - "NCBI", - "GCF_000001405.26", - 9606, - True, - "Homo sapiens", - "Genome Reference Consortium", - ], - [ - "GRCh38.p1", - "NCBI", - "GCF_000001405.27", - 9606, - True, - "Homo sapiens", - "Genome Reference Consortium", - ], - [ - "GRCh38.p2", - "NCBI", - "GCF_000001405.28", - 9606, - True, - "Homo sapiens", - "Genome Reference Consortium", - ], - [ - "GRCh38.p3", - "NCBI", - "GCF_000001405.29", - 9606, - True, - "Homo sapiens", - "Genome Reference Consortium", - ], - [ - "GRCh38.p4", - "NCBI", - "GCF_000001405.30", - 9606, - True, - "Homo sapiens", - "Genome Reference Consortium", - ], - [ - "GRCh38.p5", - "NCBI", - "GCF_000001405.31", - 9606, - True, - "Homo sapiens", - "Genome Reference Consortium", - ], - [ - "GRCh38.p6", - "NCBI", - "GCF_000001405.32", - 9606, - True, - "Homo sapiens", - "Genome Reference Consortium", - ], - [ - "GRCh38.p7", - "NCBI", - "GCF_000001405.33", - 9606, - True, - "Homo sapiens", - "Genome Reference Consortium", - ], - [ - "GRCh38.p8", - "NCBI", - "GCF_000001405.34", - 9606, - True, - "Homo sapiens", - "Genome Reference Consortium", - ], - [ - "GRCh38.p9", - "NCBI", - "GCF_000001405.35", - 9606, - True, - "Homo sapiens", - "Genome Reference Consortium", - ], - [ - "GRCh38.p10", - "NCBI", - "GCF_000001405.36", - 9606, - True, - "Homo sapiens", - "Genome Reference Consortium", - ], - [ - "GRCh38.p11", - "NCBI", - "GCF_000001405.37", - 9606, - True, - "Homo sapiens", - "Genome Reference Consortium", - ], - [ - "GRCh38.p12", - "NCBI", - "GCF_000001405.38", - 9606, - True, - "Homo sapiens", - "Genome Reference Consortium", - ], - [ - "GRCh37.p13", - "NCBI", - "GCF_000001405.25", - 9606, - True, - "Homo sapiens", - "Genome Reference Consortium", - ], - [ - "GRCh38.p13", - "NCBI", - "GCF_000001405.39", - 9606, - True, - "Homo sapiens", - "Genome Reference Consortium", - ], - ] - - # list of search results - best_result = genomepy.providers._best_search_result("GCA_000001405.39", ncbi_human) - assert best_result[2] == "GCF_000001405.39" - - # zero search results - assert genomepy.providers._best_search_result("GCA_000001405.39", []) is None - - # one search results - best_result = genomepy.providers._best_search_result( - "GCA_000001405.26", [ncbi_human[0]] - ) - assert best_result[2] == ncbi_human[0][2] - - -def test_download_assembly_report(): - assembly_report = "tests/data/sacCer3/assembly_report.txt" - genomepy.providers.download_assembly_report("GCA_000146045", assembly_report) - report = pd.read_csv(assembly_report, sep="\t", comment="#") - - assert isinstance(report, pd.DataFrame) - assert list(report.columns) == genomepy.providers.ASM_FORMAT - - def test_map_location(): mapping = genomepy.providers.map_locations( frm="sacCer3", to="ensembl", genomes_dir="tests/data" diff --git a/tests/test_13_annotation.py b/tests/test_13_annotation.py index ae1b6f74..774fafbd 100644 --- a/tests/test_13_annotation.py +++ b/tests/test_13_annotation.py @@ -47,7 +47,7 @@ def test__get_name_and_dir(): assert genome_dir == genomepy.utils.cleanpath(td["expected_dir"]), td # does not exist - with pytest.raises(ValueError): + with pytest.raises(FileNotFoundError): genomepy.annotation._get_name_and_dir("tests/data/GRCz11/what.who") # no bed/gtf/fa with pytest.raises(NotImplementedError): @@ -83,7 +83,7 @@ def test_annotation_init(caplog, annot): assert "Could not find 'empty.annotation.gtf" in caplog.text # Genome doesn't exist - with pytest.raises(ValueError): + with pytest.raises(FileNotFoundError): genomepy.Annotation("never_existed", genomes_dir="tests/data") diff --git a/tests/test_14_functions.py b/tests/test_14_functions.py index 05833192..f7778c8d 100644 --- a/tests/test_14_functions.py +++ b/tests/test_14_functions.py @@ -20,7 +20,7 @@ def test_list_available_genomes(): g = genomepy.functions.list_available_genomes("Ensembl") metadata = next(g) assert isinstance(metadata, list) - assert metadata[0:2] == ["athCun1", "Ensembl"] + assert metadata[0:2] == ["mCalJac1.pat.X", "Ensembl"] def test_list_installed_genomes():