Skip to content

Commit

Permalink
Merge branch 'release/0.11.1'
Browse files Browse the repository at this point in the history
  • Loading branch information
siebrenf committed Jan 6, 2022
2 parents 603cab7 + 3b3a072 commit 369c465
Show file tree
Hide file tree
Showing 17 changed files with 184 additions and 378 deletions.
16 changes: 16 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,21 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).

## [Unreleased]

## [0.11.1] - 2022-01-06

### Added
- `quiet` flag for `genomepy.Annotation`
- `genomepy -v` flag

### Changed
- `genomepy.Annotation` returns a `FileNotFoundError` instead of a `ValueError` where appropriate.
- `download_assembly_report` refactored. Now downloads the report for the exact same assembly accession (and not the nearest NCBI assembly).
- broader unit tests for UCSC assembly accession scraping


### Fixed
- inconsistent behaviour with assembly reports (#193 + #194)

## [0.11.0] - 2021-11-18

### Added
Expand Down Expand Up @@ -368,6 +383,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Added `-r` and `--match/--no-match` option to select sequences by regex.

[Unreleased]: https://github.com/vanheeringen-lab/genomepy/compare/master...develop
[0.11.1]: https://github.com/vanheeringen-lab/genomepy/compare/0.11.0...0.11.1
[0.11.0]: https://github.com/vanheeringen-lab/genomepy/compare/0.10.0...0.11.0
[0.10.0]: https://github.com/vanheeringen-lab/genomepy/compare/0.9.3...0.10.0
[0.9.3]: https://github.com/vanheeringen-lab/genomepy/compare/0.9.2...0.9.3
Expand Down
14 changes: 7 additions & 7 deletions docs/release_checklist.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@

1. Make sure all tests pass.

```shell
pytest -vv --disable-pytest-warnings
```
```shell
pytest -vv --disable-pytest-warnings
```

2. Create release candidate with `git flow`:

Expand Down Expand Up @@ -58,13 +58,13 @@

```shell
git flow release finish ${new_version}
```
```

7. Push everything to github, including tags:

```shell
git push --follow-tags origin develop
```
```shell
git push --follow-tags origin develop
```

8. Pull into master

Expand Down
2 changes: 1 addition & 1 deletion genomepy/__about__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
"""Metadata"""
__version__ = "0.11.0"
__version__ = "0.11.1"
__author__ = "Simon van Heeringen, Siebren Frölich, Maarten van der Sande"
10 changes: 6 additions & 4 deletions genomepy/annotation/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ class Annotation:
Genome name/directory/fasta or gene annotation BED/GTF file.
genomes_dir : str, optional
Genomes installation directory.
quiet : bool, optional
Silence init warnings
Returns
-------
Expand All @@ -52,7 +54,7 @@ class Annotation:
annotation_contigs: list = None
"Contigs found in the gene annotation BED"

def __init__(self, name: str, genomes_dir: str = None):
def __init__(self, name: str, genomes_dir: str = None, quiet: bool = False):
# name and directory
n, g = _get_name_and_dir(name, genomes_dir)
self.name = n
Expand All @@ -65,12 +67,12 @@ def __init__(self, name: str, genomes_dir: str = None):
suffixes = Path(fname).suffixes[-2:]
b = fname
if not (".bed" in suffixes or ".BED" in suffixes):
b = _get_file(self.genome_dir, f"{self.name}.annotation.bed")
b = _get_file(self.genome_dir, f"{self.name}.annotation.bed", not quiet)
self.annotation_bed_file = b
"path to the gene annotation BED file"
g = fname
if not (".gtf" in suffixes or ".GTF" in suffixes):
g = _get_file(self.genome_dir, f"{self.name}.annotation.gtf")
g = _get_file(self.genome_dir, f"{self.name}.annotation.gtf", not quiet)
self.annotation_gtf_file = g
"path to the gene annotation GTF file"

Expand Down Expand Up @@ -385,7 +387,7 @@ def _get_name_and_dir(name, genomes_dir=None):
elif name in os.listdir(genomes_dir):
genome_dir = os.path.join(genomes_dir, name)
else:
raise ValueError(f"Could not find {name}")
raise FileNotFoundError(f"Could not find {name}")
return name, genome_dir


Expand Down
49 changes: 0 additions & 49 deletions genomepy/annotation/mygene.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,55 +161,6 @@ def _parse_mygene_input(field, product=None):
return field, product


# def ensembl_genome_info(self) -> Optional[Tuple[str, str, str]]:
# """
# Return Ensembl genome information for this genome.
#
# Requires accession numbers to match (excluding patch numbers)
#
# Returns
# -------
# (str, str, str)
# Ensembl name, accession, taxonomy_id
# """
# _check_property(self.readme_file, "README.txt")
#
# metadata, _ = read_readme(self.readme_file)
# if metadata.get("provider") == "Ensembl":
# return metadata["name"], metadata["assembly_accession"], metadata["tax_id"]
#
# if metadata.get("assembly_accession", "na") == "na":
# logger.warning("Cannot find a matching genome without an assembly accession.")
# return
#
# asm_acc = metadata["assembly_accession"]
# search_result = nearest_assembly(asm_acc, "Ensembl")
# if search_result is None:
# return
#
# return safe(search_result[0]), search_result[2], search_result[3]
#
#
# def _query_mygene(
# self,
# query: Iterable[str],
# field: str = "genomic_pos",
# ) -> pd.DataFrame:
# # mygene.info only queries the most recent version of the Ensembl database
# # We can only safely continue if the local genome matched the Ensembl genome.
# # Even if the local genome was installed via Ensembl, we still need to check
# # if it is the same version
# ensembl_info = ensembl_genome_info(self)
# if ensembl_info is None:
# return pd.DataFrame()
#
# tax_id = ensembl_info[2]
# if not str(tax_id).isdigit():
# raise ValueError("No taxomoy ID found")
#
# return query_mygene(query, tax_id, field)


def _filter_query(query: pd.DataFrame) -> pd.DataFrame:
"""per queried gene, keep the best matching, non-NaN, mapping"""
if "notfound" in query:
Expand Down
2 changes: 1 addition & 1 deletion genomepy/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@


@click.group(context_settings=CONTEXT_SETTINGS)
@click.version_option(genomepy.__version__)
@click.version_option(genomepy.__version__, "-v", "--version")
def cli():
pass # noqa

Expand Down
108 changes: 3 additions & 105 deletions genomepy/providers/__init__.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,15 @@
"""Provider class, modules & related functions"""
import os
from typing import List, Optional
from typing import Optional

import pandas as pd
from loguru import logger

from genomepy.files import read_readme
from genomepy.providers.base import ASM_FORMAT
from genomepy.providers.ensembl import EnsemblProvider
from genomepy.providers.gencode import GencodeProvider
from genomepy.providers.local import LocalProvider
from genomepy.providers.ncbi import NcbiProvider
from genomepy.providers.ncbi import NcbiProvider, download_assembly_report
from genomepy.providers.ucsc import UcscProvider
from genomepy.providers.url import UrlProvider
from genomepy.utils import get_genomes_dir, safe
Expand All @@ -22,7 +21,6 @@
"search",
"map_locations",
"download_assembly_report",
"nearest_assembly",
"online_providers",
]

Expand Down Expand Up @@ -134,106 +132,6 @@ def search(term, provider: str = None):
yield ret


def _closest_patch_lvl(reference, targets):
ref_patch = int(reference.split(".")[1]) if "." in reference else 0
nearest = [999, []]
for target in targets:
tgt_patch = int(target.split(".")[1]) if "." in target else 0
distance = abs(tgt_patch + 0.1 - ref_patch) # tiebreaker: newer > older patches
if distance == nearest[0]:
nearest[1].append(target)
if distance < nearest[0]:
nearest = [distance, [target]]
return nearest[1]


def _best_accession(reference: str, targets: list):
"""Return the nearest accession ID from a list of IDs"""
if len(targets) == 1:
return targets[0]

# GCA/GCF
matching_prefix = [t for t in targets if t.startswith(reference[0:3])]
if len(matching_prefix) > 0:
targets = matching_prefix

# patch levels
# e.g. GCA_000002035.4 & GCA_000002035.3
targets = _closest_patch_lvl(reference, targets)

if len(targets) > 1:
logger.info(
f"Multiple matching accession numbers found. Selecting the closest ({targets[0]})."
)
return targets[0]


def _best_search_result(asm_acc: str, results: List[list]) -> list:
"""Return the best search result based on accession IDs."""
results = [res for res in results if res[2] is not None]

if len(results) > 1:
accessions = [res[2] for res in results]
bes_acc = _best_accession(asm_acc, accessions)
results = [res for res in results if res[2] == bes_acc]

if len(results) > 0:
return results[0]


def nearest_assembly(asm_acc: str, provider: str) -> list:
"""
Return the search result of the assembly nearest to
the given accession ID, in the specified provider.
"""
if not asm_acc.startswith(("GCA_", "GCF_")):
raise ValueError("asm_acc must be an assembly accession ID.")

search_results = list(search(asm_acc, provider=provider))
best_result = _best_search_result(asm_acc, search_results)
if best_result is None:
logger.warning(f"No assembly similar to {asm_acc} on {provider}.")
return best_result


def download_assembly_report(asm_acc: str, fname: str = None):
"""
Retrieve the NCBI assembly report.
Returns the assembly_report as a pandas DataFrame if fname is not specified.
Parameters
----------
asm_acc : str
Assembly accession (GCA or GCF)
fname : str, optional
Save assembly_report to this filename.
Returns
-------
pandas.DataFrame
NCBI assembly report.
"""
search_result = nearest_assembly(asm_acc, "NCBI")
if search_result is None:
return
ncbi_acc = search_result[2]
ncbi_name = search_result[0]

# NCBI FTP location of assembly report
assembly_report = (
f"https://ftp.ncbi.nlm.nih.gov/genomes/all/{ncbi_acc[0:3]}/"
+ f"{ncbi_acc[4:7]}/{ncbi_acc[7:10]}/{ncbi_acc[10:13]}/"
+ f"{ncbi_acc}_{ncbi_name}/{ncbi_acc}_{ncbi_name}_assembly_report.txt"
)
asm_report = pd.read_csv(assembly_report, sep="\t", comment="#", names=ASM_FORMAT)

if fname:
asm_report.to_csv(fname, sep="\t", index=False)
else:
return asm_report


def map_locations(
frm: str, to: str, genomes_dir: Optional[str] = None
) -> Optional[pd.DataFrame]:
Expand Down Expand Up @@ -279,7 +177,7 @@ def map_locations(
logger.warning("Cannot map without an assembly report.")
return

asm_report = pd.read_csv(frm_asm_report, sep="\t", comment="#")
asm_report = pd.read_csv(frm_asm_report, sep="\t", comment="#", dtype=str)
asm_report["ensembl_name"] = asm_report["Sequence-Name"]
asm_report["ncbi_name"] = asm_report["Sequence-Name"]
asm_report["ucsc_name"] = asm_report["UCSC-style-name"]
Expand Down
21 changes: 6 additions & 15 deletions genomepy/providers/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,19 +18,6 @@
from genomepy.online import download_file
from genomepy.utils import get_genomes_dir, get_localname, lower, mkdir_p, rm_rf, safe

ASM_FORMAT = [
"Sequence-Name",
"Sequence-Role",
"Assigned-Molecule",
"Assigned-Molecule-Location/Type",
"GenBank-Accn",
"Relationship",
"RefSeq-Accn",
"Assembly-Unit",
"Sequence-Length",
"UCSC-style-name",
]


class BaseProvider:
"""
Expand Down Expand Up @@ -111,7 +98,7 @@ def genome_taxid(self, name: str) -> int:
if tid.isdigit():
return int(tid)

def assembly_accession(self, name: str) -> str:
def assembly_accession(self, name: str) -> str or None:
"""
Return the assembly accession number (GCA* or GCF*) for a genome.
Expand Down Expand Up @@ -523,7 +510,11 @@ def rename_contigs(annot_file):
genome_dir = os.path.dirname(os.path.dirname(annot_file))
asm_report = os.path.join(genome_dir, "assembly_report.txt")
gencode2ucsc = pd.read_csv(
asm_report, sep="\t", comment="#", usecols=["GenBank-Accn", "UCSC-style-name"]
asm_report,
sep="\t",
comment="#",
usecols=["GenBank-Accn", "UCSC-style-name"],
dtype=str,
)
gtf = read_annot(annot_file)

Expand Down
Loading

0 comments on commit 369c465

Please sign in to comment.