diff --git a/bioframe/io/resources.py b/bioframe/io/resources.py index bedde12..fe892f7 100644 --- a/bioframe/io/resources.py +++ b/bioframe/io/resources.py @@ -1,23 +1,14 @@ +import urllib from functools import partial -from urllib.parse import urljoin, urlencode -import urllib -import os -import posixpath as pp -import os.path as op +from urllib.parse import urljoin +from typing import Union + import numpy as np import pandas as pd -import requests -import socket -import base64 -import glob - -import pkg_resources -from .schemas import SCHEMAS, UCSC_MRNA_FIELDS -from .fileops import ( - read_table, - read_chromsizes, -) +from .assembly import assembly_info +from .fileops import read_chromsizes, read_table +from .schemas import SCHEMAS __all__ = [ "fetch_chromsizes", @@ -26,59 +17,84 @@ ] -def _check_connectivity(reference="http://www.google.com"): - try: - urllib.request.urlopen(reference, timeout=5) - return True - except urllib.request.URLError: - return False - except socket.timeout: - return False - - def fetch_chromsizes( - db, - provider=None, - filter_chroms=True, - chrom_patterns=(r"^chr[0-9]+$", r"^chr[XY]$", r"^chrM$"), - natsort=True, - as_bed=False, + db: str, + *, + provider: str = "local", + as_bed: bool = False, + filter_chroms: bool = True, + chrom_patterns: tuple = (r"^chr[0-9]+$", r"^chr[XY]$", r"^chrM$"), + natsort: bool = True, **kwargs, -): +) -> Union[pd.Series, pd.DataFrame]: """ - Fetch chromsizes from the UCSC database or local storage. + Fetch chromsizes from local storage or the UCSC database. Parameters ---------- - provider : str - The provider of chromsizes. Currently, only 'ucsc' is implemented. + db : str + Assembly name. + provider : str, optional [default: "local"] + The provider of chromsizes. Either "local" for local storage or "ucsc". + as_bed : bool, optional + If True, return chromsizes as an interval DataFrame (chrom, start, end) + instead of a Series. + + The remaining options only apply to provider="ucsc". + filter_chroms : bool, optional Filter for chromosome names given in ``chrom_patterns``. chrom_patterns : sequence, optional Sequence of regular expressions to capture desired sequence names. natsort : bool, optional Sort each captured group of names in natural order. Default is True. - as_bed : bool, optional - If True, return chromsizes as an interval dataframe (chrom, start, end). **kwargs : Passed to :func:`pandas.read_csv` Returns ------- - Series of integer bp lengths indexed by sequence name or an interval dataframe. + Series of integer bp lengths indexed by sequence name or BED3 DataFrame. + Notes + ----- + For more fine-grained control over the chromsizes from local storage, + use :func:`bioframe.assembly_info`. + + Examples + -------- + >>> fetch_chromsizes("hg38") + name + chr1 248956422 + chr2 242193529 + chr3 198295559 + ... ... + chrX 156040895 + chrY 57227415 + chrM 16569 + Name: length, dtype: int64 + + >>> fetch_chromsizes("hg38", as_bed=True) + chrom start end + 0 chr1 0 248956422 + 1 chr2 0 242193529 + 2 chr3 0 198295559 + ... ... + 21 chrX 0 156040895 + 22 chrY 0 57227415 + 23 chrM 0 16569 + + See also + -------- + bioframe.assembly_info + bioframe.UCSCClient """ - if provider == "local": - fpath = f"data/{db}.chrom.sizes" - if pkg_resources.resource_exists("bioframe.io", fpath): - return read_chromsizes( - pkg_resources.resource_filename("bioframe.io", fpath) - ) + assembly = assembly_info(db) + if as_bed: + return assembly.viewframe[["chrom", "start", "end"]].copy() else: - raise LookupError(f"Assembly '{db}' not found in local storage") - - if provider == "ucsc" or provider is None: + return assembly.chromsizes + elif provider == "ucsc": return UCSCClient(db).fetch_chromsizes( filter_chroms=filter_chroms, chrom_patterns=chrom_patterns, @@ -90,7 +106,9 @@ def fetch_chromsizes( raise ValueError("Unknown provider '{}'".format(provider)) -def _origins_from_cytoband(cyb, band_col="gieStain"): +def _origins_from_cytoband( + cyb: pd.DataFrame, band_col: str = "gieStain" +) -> pd.DataFrame: """ Extract chromosomal origin positions separating chromosome arms from cytological band data. Takes the cytological origin, i.e. the boundary @@ -124,7 +142,7 @@ def _origins_from_cytoband(cyb, band_col="gieStain"): return pd.DataFrame.from_records(cens) -def _origins_from_ucsccentromeres(cens): +def _origins_from_ucsccentromeres(cens: pd.DataFrame) -> pd.DataFrame: """ Extract chromosomal origin positions from UCSC centromeres.txt table describing centromere model sequences. Takes the midpoint of all @@ -153,7 +171,7 @@ def _origins_from_ucsccentromeres(cens): return cens -def fetch_centromeres(db, provider=None, merge=True, verbose=False): +def fetch_centromeres(db: str, provider: str = "local") -> pd.DataFrame: """ Extract centromere locations for a given assembly 'db' from a variety of file formats in UCSC (cytoband, centromeres) depending on @@ -161,13 +179,11 @@ def fetch_centromeres(db, provider=None, merge=True, verbose=False): Parameters ---------- - db : str - - merge : bool - Whether to merge all centromere intervals per chromosome into - one consolidated centromere interval. - Default True. + Assembly name. + provider : str, optional [default: "local"] + The provider of centromere data. Either "local" for local storage + or "ucsc". Returns ------- @@ -175,27 +191,33 @@ def fetch_centromeres(db, provider=None, merge=True, verbose=False): Notes ----- - The priority goes as - - Local (not implemented) - - centromeres.txt - - cytoBandIdeo - - cytoBand + When provider="local", centromeres are derived from cytoband tables + in local storage. - Gap files no longer provide centromere information. + Whe provider="ucsc", the fallback priority goes as follows: + - UCSC cytoBand + - UCSC cytoBandIdeo + - UCSC centromeres.txt + + Note that UCSC "gap" files no longer provide centromere information. + Currently only works for human assemblies. + See also + -------- + bioframe.assembly_info + bioframe.UCSCClient """ if provider == "local": - raise NotImplementedError("local method not currently implemented") - fpath = f"data/{db}.centromeres" - if pkg_resources.resource_exists("bioframe.io", fpath): - return read_chromsizes( - pkg_resources.resource_filename("bioframe.io", fpath) + assembly = assembly_info(db) + cyb = assembly.cytobands + if cyb is None: + raise ValueError( + f"No source for centromere data found from provider '{provider}'." ) - else: - raise LookupError(f"Centromeres for '{db}' not found in local storage") + return _origins_from_cytoband(cyb, band_col="stain") - if provider == "ucsc" or provider is None: + elif provider == "ucsc": client = UCSCClient(db) fetchers = [ ("cytoband", client.fetch_cytoband), @@ -210,31 +232,34 @@ def fetch_centromeres(db, provider=None, merge=True, verbose=False): except urllib.error.HTTPError: pass else: - raise ValueError("No source for centromere data found.") - else: - raise NotImplementedError("currently UCSC is only implemented provider") + raise ValueError( + f"No source for centromere data found from provider '{provider}'." + ) + + if schema == "centromeres": + return _origins_from_ucsccentromeres(df) + else: + return _origins_from_cytoband(df) - if schema == "centromeres": - return _origins_from_ucsccentromeres(df) else: - return _origins_from_cytoband(df) + raise ValueError("Unknown provider '{}'".format(provider)) class UCSCClient: BASE_URL = "http://hgdownload.cse.ucsc.edu/" - def __init__(self, db): + def __init__(self, db: str): self._db = db self._db_url = urljoin(self.BASE_URL, f"goldenPath/{db}/") def fetch_chromsizes( self, - filter_chroms=True, - chrom_patterns=(r"^chr[0-9]+$", r"^chr[XY]$", r"^chrM$"), - natsort=True, - as_bed=False, + filter_chroms: bool = True, + chrom_patterns: tuple = (r"^chr[0-9]+$", r"^chr[XY]$", r"^chrM$"), + natsort: bool = True, + as_bed: bool = False, **kwargs, - ): + ) -> Union[pd.Series, pd.DataFrame]: url = urljoin(self._db_url, f"bigZips/{self._db}.chrom.sizes") return read_chromsizes( url, @@ -245,9 +270,9 @@ def fetch_chromsizes( **kwargs, ) - def fetch_centromeres(self, **kwargs): + def fetch_centromeres(self, **kwargs) -> pd.DataFrame: url = urljoin(self._db_url, "database/centromeres.txt.gz") - return read_table(url, schema="centromeres") + return read_table(url, schema="centromeres", **kwargs) def fetch_gaps(self, **kwargs): url = urljoin(self._db_url, "database/gap.txt.gz") @@ -258,19 +283,17 @@ def fetch_gaps(self, **kwargs): **kwargs, ) - def fetch_cytoband(self, ideo=False, **kwargs): + def fetch_cytoband(self, ideo: bool = False, **kwargs) -> pd.DataFrame: if ideo: url = urljoin(self._db_url, "database/cytoBandIdeo.txt.gz") else: url = urljoin(self._db_url, "database/cytoBand.txt.gz") return read_table(url, schema="cytoband") - def fetch_mrna(self, **kwargs): + def fetch_mrna(self, **kwargs) -> pd.DataFrame: url = urljoin(self._db_url, "database/all_mrna.txt.gz") return read_table( url, - schema=UCSC_MRNA_FIELDS, + schema=SCHEMAS["all_mrna"], **kwargs, ) - - diff --git a/bioframe/io/schemas.py b/bioframe/io/schemas.py index c73be06..fa820df 100644 --- a/bioframe/io/schemas.py +++ b/bioframe/io/schemas.py @@ -192,7 +192,7 @@ "vcf": VCF_FIELDS, "jaspar": JASPAR_FIELDS, "gap": GAP_FIELDS, - "UCSCmRNA": UCSC_MRNA_FIELDS, + "all_mrna": UCSC_MRNA_FIELDS, "pgsnp": PGSNP_FIELDS, } diff --git a/tests/test_io.py b/tests/test_fileops.py similarity index 100% rename from tests/test_io.py rename to tests/test_fileops.py diff --git a/tests/test_resources.py b/tests/test_resources.py new file mode 100644 index 0000000..0e984d6 --- /dev/null +++ b/tests/test_resources.py @@ -0,0 +1,43 @@ +import pandas as pd +import numpy as np +import pytest + +import bioframe + + +def test_fetch_chromsizes(): + db = "hg38" + for provider in ["local", "ucsc"]: + chromsizes = bioframe.fetch_chromsizes(db, provider=provider) + assert isinstance(chromsizes, pd.Series) + assert chromsizes.name == "length" + assert len(chromsizes) == 25 + + chromsizes_df = bioframe.fetch_chromsizes( + db, provider=provider, as_bed=True + ) + assert isinstance(chromsizes_df, pd.DataFrame) + assert list(chromsizes_df.columns) == ["chrom", "start", "end"] + assert len(chromsizes_df) == 25 + + # Check synonymous local assemblies + assert bioframe.fetch_chromsizes("hg38", provider="local").equals( + bioframe.fetch_chromsizes("GRCh38", provider="local") + ) + + +def test_fetch_chromsizes_local_vs_ucsc(): + for db in ["hg19", "hg38", "mm9", "mm10"]: + assert bioframe.fetch_chromsizes(db, provider="local").equals( + bioframe.fetch_chromsizes(db, provider="ucsc") + ) + + +def test_fetch_centromeres(): + for db in ["hg19", "hg38"]: + # Note: UCSC will usually have a different ordering of chromosomes + for provider in ["local", "ucsc"]: + centromeres = bioframe.fetch_centromeres(db, provider=provider) + assert isinstance(centromeres, pd.DataFrame) + assert list(centromeres.columns) == ["chrom", "start", "end", "mid"] + assert len(centromeres) == 24