Skip to content

Commit

Permalink
Implement local providers for fetch_chromsizes and fetch_centromeres (#…
Browse files Browse the repository at this point in the history
…156)

* Implement local providers for fetch_chromsizes and fetch_centromeres

* Use old Union typing syntax
  • Loading branch information
nvictus authored May 30, 2023
1 parent 380bf96 commit 561a4f2
Show file tree
Hide file tree
Showing 4 changed files with 157 additions and 91 deletions.
203 changes: 113 additions & 90 deletions bioframe/io/resources.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,14 @@
import urllib
from functools import partial
from urllib.parse import urljoin, urlencode
import urllib
import os
import posixpath as pp
import os.path as op
from urllib.parse import urljoin
from typing import Union

import numpy as np
import pandas as pd
import requests
import socket
import base64
import glob

import pkg_resources

from .schemas import SCHEMAS, UCSC_MRNA_FIELDS
from .fileops import (
read_table,
read_chromsizes,
)
from .assembly import assembly_info
from .fileops import read_chromsizes, read_table
from .schemas import SCHEMAS

__all__ = [
"fetch_chromsizes",
Expand All @@ -26,59 +17,84 @@
]


def _check_connectivity(reference="http://www.google.com"):
try:
urllib.request.urlopen(reference, timeout=5)
return True
except urllib.request.URLError:
return False
except socket.timeout:
return False


def fetch_chromsizes(
db,
provider=None,
filter_chroms=True,
chrom_patterns=(r"^chr[0-9]+$", r"^chr[XY]$", r"^chrM$"),
natsort=True,
as_bed=False,
db: str,
*,
provider: str = "local",
as_bed: bool = False,
filter_chroms: bool = True,
chrom_patterns: tuple = (r"^chr[0-9]+$", r"^chr[XY]$", r"^chrM$"),
natsort: bool = True,
**kwargs,
):
) -> Union[pd.Series, pd.DataFrame]:
"""
Fetch chromsizes from the UCSC database or local storage.
Fetch chromsizes from local storage or the UCSC database.
Parameters
----------
provider : str
The provider of chromsizes. Currently, only 'ucsc' is implemented.
db : str
Assembly name.
provider : str, optional [default: "local"]
The provider of chromsizes. Either "local" for local storage or "ucsc".
as_bed : bool, optional
If True, return chromsizes as an interval DataFrame (chrom, start, end)
instead of a Series.
The remaining options only apply to provider="ucsc".
filter_chroms : bool, optional
Filter for chromosome names given in ``chrom_patterns``.
chrom_patterns : sequence, optional
Sequence of regular expressions to capture desired sequence names.
natsort : bool, optional
Sort each captured group of names in natural order. Default is True.
as_bed : bool, optional
If True, return chromsizes as an interval dataframe (chrom, start, end).
**kwargs :
Passed to :func:`pandas.read_csv`
Returns
-------
Series of integer bp lengths indexed by sequence name or an interval dataframe.
Series of integer bp lengths indexed by sequence name or BED3 DataFrame.
Notes
-----
For more fine-grained control over the chromsizes from local storage,
use :func:`bioframe.assembly_info`.
Examples
--------
>>> fetch_chromsizes("hg38")
name
chr1 248956422
chr2 242193529
chr3 198295559
... ...
chrX 156040895
chrY 57227415
chrM 16569
Name: length, dtype: int64
>>> fetch_chromsizes("hg38", as_bed=True)
chrom start end
0 chr1 0 248956422
1 chr2 0 242193529
2 chr3 0 198295559
... ...
21 chrX 0 156040895
22 chrY 0 57227415
23 chrM 0 16569
See also
--------
bioframe.assembly_info
bioframe.UCSCClient
"""

if provider == "local":
fpath = f"data/{db}.chrom.sizes"
if pkg_resources.resource_exists("bioframe.io", fpath):
return read_chromsizes(
pkg_resources.resource_filename("bioframe.io", fpath)
)
assembly = assembly_info(db)
if as_bed:
return assembly.viewframe[["chrom", "start", "end"]].copy()
else:
raise LookupError(f"Assembly '{db}' not found in local storage")

if provider == "ucsc" or provider is None:
return assembly.chromsizes
elif provider == "ucsc":
return UCSCClient(db).fetch_chromsizes(
filter_chroms=filter_chroms,
chrom_patterns=chrom_patterns,
Expand All @@ -90,7 +106,9 @@ def fetch_chromsizes(
raise ValueError("Unknown provider '{}'".format(provider))


def _origins_from_cytoband(cyb, band_col="gieStain"):
def _origins_from_cytoband(
cyb: pd.DataFrame, band_col: str = "gieStain"
) -> pd.DataFrame:
"""
Extract chromosomal origin positions separating chromosome arms from
cytological band data. Takes the cytological origin, i.e. the boundary
Expand Down Expand Up @@ -124,7 +142,7 @@ def _origins_from_cytoband(cyb, band_col="gieStain"):
return pd.DataFrame.from_records(cens)


def _origins_from_ucsccentromeres(cens):
def _origins_from_ucsccentromeres(cens: pd.DataFrame) -> pd.DataFrame:
"""
Extract chromosomal origin positions from UCSC centromeres.txt table
describing centromere model sequences. Takes the midpoint of all
Expand Down Expand Up @@ -153,49 +171,53 @@ def _origins_from_ucsccentromeres(cens):
return cens


def fetch_centromeres(db, provider=None, merge=True, verbose=False):
def fetch_centromeres(db: str, provider: str = "local") -> pd.DataFrame:
"""
Extract centromere locations for a given assembly 'db' from a variety
of file formats in UCSC (cytoband, centromeres) depending on
availability, returning a DataFrame.
Parameters
----------
db : str
merge : bool
Whether to merge all centromere intervals per chromosome into
one consolidated centromere interval.
Default True.
Assembly name.
provider : str, optional [default: "local"]
The provider of centromere data. Either "local" for local storage
or "ucsc".
Returns
-------
DataFrame with centromere 'chrom', 'start', 'end', 'mid'.
Notes
-----
The priority goes as
- Local (not implemented)
- centromeres.txt
- cytoBandIdeo
- cytoBand
When provider="local", centromeres are derived from cytoband tables
in local storage.
Gap files no longer provide centromere information.
Whe provider="ucsc", the fallback priority goes as follows:
- UCSC cytoBand
- UCSC cytoBandIdeo
- UCSC centromeres.txt
Note that UCSC "gap" files no longer provide centromere information.
Currently only works for human assemblies.
See also
--------
bioframe.assembly_info
bioframe.UCSCClient
"""
if provider == "local":
raise NotImplementedError("local method not currently implemented")
fpath = f"data/{db}.centromeres"
if pkg_resources.resource_exists("bioframe.io", fpath):
return read_chromsizes(
pkg_resources.resource_filename("bioframe.io", fpath)
assembly = assembly_info(db)
cyb = assembly.cytobands
if cyb is None:
raise ValueError(
f"No source for centromere data found from provider '{provider}'."
)
else:
raise LookupError(f"Centromeres for '{db}' not found in local storage")
return _origins_from_cytoband(cyb, band_col="stain")

if provider == "ucsc" or provider is None:
elif provider == "ucsc":
client = UCSCClient(db)
fetchers = [
("cytoband", client.fetch_cytoband),
Expand All @@ -210,31 +232,34 @@ def fetch_centromeres(db, provider=None, merge=True, verbose=False):
except urllib.error.HTTPError:
pass
else:
raise ValueError("No source for centromere data found.")
else:
raise NotImplementedError("currently UCSC is only implemented provider")
raise ValueError(
f"No source for centromere data found from provider '{provider}'."
)

if schema == "centromeres":
return _origins_from_ucsccentromeres(df)
else:
return _origins_from_cytoband(df)

if schema == "centromeres":
return _origins_from_ucsccentromeres(df)
else:
return _origins_from_cytoband(df)
raise ValueError("Unknown provider '{}'".format(provider))


class UCSCClient:
BASE_URL = "http://hgdownload.cse.ucsc.edu/"

def __init__(self, db):
def __init__(self, db: str):
self._db = db
self._db_url = urljoin(self.BASE_URL, f"goldenPath/{db}/")

def fetch_chromsizes(
self,
filter_chroms=True,
chrom_patterns=(r"^chr[0-9]+$", r"^chr[XY]$", r"^chrM$"),
natsort=True,
as_bed=False,
filter_chroms: bool = True,
chrom_patterns: tuple = (r"^chr[0-9]+$", r"^chr[XY]$", r"^chrM$"),
natsort: bool = True,
as_bed: bool = False,
**kwargs,
):
) -> Union[pd.Series, pd.DataFrame]:
url = urljoin(self._db_url, f"bigZips/{self._db}.chrom.sizes")
return read_chromsizes(
url,
Expand All @@ -245,9 +270,9 @@ def fetch_chromsizes(
**kwargs,
)

def fetch_centromeres(self, **kwargs):
def fetch_centromeres(self, **kwargs) -> pd.DataFrame:
url = urljoin(self._db_url, "database/centromeres.txt.gz")
return read_table(url, schema="centromeres")
return read_table(url, schema="centromeres", **kwargs)

def fetch_gaps(self, **kwargs):
url = urljoin(self._db_url, "database/gap.txt.gz")
Expand All @@ -258,19 +283,17 @@ def fetch_gaps(self, **kwargs):
**kwargs,
)

def fetch_cytoband(self, ideo=False, **kwargs):
def fetch_cytoband(self, ideo: bool = False, **kwargs) -> pd.DataFrame:
if ideo:
url = urljoin(self._db_url, "database/cytoBandIdeo.txt.gz")
else:
url = urljoin(self._db_url, "database/cytoBand.txt.gz")
return read_table(url, schema="cytoband")

def fetch_mrna(self, **kwargs):
def fetch_mrna(self, **kwargs) -> pd.DataFrame:
url = urljoin(self._db_url, "database/all_mrna.txt.gz")
return read_table(
url,
schema=UCSC_MRNA_FIELDS,
schema=SCHEMAS["all_mrna"],
**kwargs,
)


2 changes: 1 addition & 1 deletion bioframe/io/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,7 @@
"vcf": VCF_FIELDS,
"jaspar": JASPAR_FIELDS,
"gap": GAP_FIELDS,
"UCSCmRNA": UCSC_MRNA_FIELDS,
"all_mrna": UCSC_MRNA_FIELDS,
"pgsnp": PGSNP_FIELDS,
}

Expand Down
File renamed without changes.
43 changes: 43 additions & 0 deletions tests/test_resources.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import pandas as pd
import numpy as np
import pytest

import bioframe


def test_fetch_chromsizes():
db = "hg38"
for provider in ["local", "ucsc"]:
chromsizes = bioframe.fetch_chromsizes(db, provider=provider)
assert isinstance(chromsizes, pd.Series)
assert chromsizes.name == "length"
assert len(chromsizes) == 25

chromsizes_df = bioframe.fetch_chromsizes(
db, provider=provider, as_bed=True
)
assert isinstance(chromsizes_df, pd.DataFrame)
assert list(chromsizes_df.columns) == ["chrom", "start", "end"]
assert len(chromsizes_df) == 25

# Check synonymous local assemblies
assert bioframe.fetch_chromsizes("hg38", provider="local").equals(
bioframe.fetch_chromsizes("GRCh38", provider="local")
)


def test_fetch_chromsizes_local_vs_ucsc():
for db in ["hg19", "hg38", "mm9", "mm10"]:
assert bioframe.fetch_chromsizes(db, provider="local").equals(
bioframe.fetch_chromsizes(db, provider="ucsc")
)


def test_fetch_centromeres():
for db in ["hg19", "hg38"]:
# Note: UCSC will usually have a different ordering of chromosomes
for provider in ["local", "ucsc"]:
centromeres = bioframe.fetch_centromeres(db, provider=provider)
assert isinstance(centromeres, pd.DataFrame)
assert list(centromeres.columns) == ["chrom", "start", "end", "mid"]
assert len(centromeres) == 24

0 comments on commit 561a4f2

Please sign in to comment.