Implement local providers for fetch_chromsizes and fetch_centromeres (#…

…156) * Implement local providers for fetch_chromsizes and fetch_centromeres * Use old Union typing syntax
open2c · May 30, 2023 · 561a4f2 · 561a4f2
1 parent 380bf96
commit 561a4f2
Show file tree

Hide file tree

Showing 4 changed files with 157 additions and 91 deletions.
diff --git a/bioframe/io/resources.py b/bioframe/io/resources.py
@@ -1,23 +1,14 @@
+import urllib 
 from functools import partial
-from urllib.parse import urljoin, urlencode
-import urllib
-import os
-import posixpath as pp
-import os.path as op
+from urllib.parse import urljoin
+from typing import Union
+
 import numpy as np
 import pandas as pd
-import requests
-import socket
-import base64
-import glob
-
-import pkg_resources
 
-from .schemas import SCHEMAS, UCSC_MRNA_FIELDS
-from .fileops import (
-    read_table,
-    read_chromsizes,
-)
+from .assembly import assembly_info
+from .fileops import read_chromsizes, read_table
+from .schemas import SCHEMAS
 
 __all__ = [
     "fetch_chromsizes",
@@ -26,59 +17,84 @@
 ]
 
 
-def _check_connectivity(reference="http://www.google.com"):
-    try:
-        urllib.request.urlopen(reference, timeout=5)
-        return True
-    except urllib.request.URLError:
-        return False
-    except socket.timeout:
-        return False
-
-
 def fetch_chromsizes(
-    db,
-    provider=None,
-    filter_chroms=True,
-    chrom_patterns=(r"^chr[0-9]+$", r"^chr[XY]$", r"^chrM$"),
-    natsort=True,
-    as_bed=False,
+    db: str,
+    *,
+    provider: str = "local",
+    as_bed: bool = False,
+    filter_chroms: bool = True,
+    chrom_patterns: tuple = (r"^chr[0-9]+$", r"^chr[XY]$", r"^chrM$"),
+    natsort: bool = True,
     **kwargs,
-):
+) -> Union[pd.Series, pd.DataFrame]:
     """
-    Fetch chromsizes from the UCSC database or local storage.
+    Fetch chromsizes from local storage or the UCSC database.
 
     Parameters
     ----------
-    provider : str
-        The provider of chromsizes. Currently, only 'ucsc' is implemented.
+    db : str
+        Assembly name.
+    provider : str, optional [default: "local"]
+        The provider of chromsizes. Either "local" for local storage or "ucsc".
+    as_bed : bool, optional
+        If True, return chromsizes as an interval DataFrame (chrom, start, end)
+        instead of a Series.
+    
+    The remaining options only apply to provider="ucsc".
+    
     filter_chroms : bool, optional
         Filter for chromosome names given in ``chrom_patterns``.
     chrom_patterns : sequence, optional
         Sequence of regular expressions to capture desired sequence names.
     natsort : bool, optional
         Sort each captured group of names in natural order. Default is True.
-    as_bed : bool, optional
-        If True, return chromsizes as an interval dataframe (chrom, start, end).
     **kwargs :
         Passed to :func:`pandas.read_csv`
 
     Returns
     -------
-    Series of integer bp lengths indexed by sequence name or an interval dataframe.
+    Series of integer bp lengths indexed by sequence name or BED3 DataFrame.
 
+    Notes
+    -----
+    For more fine-grained control over the chromsizes from local storage,
+    use :func:`bioframe.assembly_info`.
+
+    Examples
+    --------
+    >>> fetch_chromsizes("hg38")
+    name
+    chr1     248956422
+    chr2     242193529
+    chr3     198295559
+    ...      ...
+    chrX     156040895
+    chrY      57227415
+    chrM         16569
+    Name: length, dtype: int64
+
+    >>> fetch_chromsizes("hg38", as_bed=True)
+            chrom      start        end
+    0        chr1          0  248956422
+    1        chr2          0  242193529
+    2        chr3          0  198295559
+    ...      ...
+    21       chrX          0  156040895
+    22       chrY          0   57227415
+    23       chrM          0      16569
+
+    See also
+    --------
+    bioframe.assembly_info
+    bioframe.UCSCClient
     """
-
     if provider == "local":
-        fpath = f"data/{db}.chrom.sizes"
-        if pkg_resources.resource_exists("bioframe.io", fpath):
-            return read_chromsizes(
-                pkg_resources.resource_filename("bioframe.io", fpath)
-            )
+        assembly = assembly_info(db)
+        if as_bed:
+            return assembly.viewframe[["chrom", "start", "end"]].copy()
         else:
-            raise LookupError(f"Assembly '{db}' not found in local storage")
-
-    if provider == "ucsc" or provider is None:
+            return assembly.chromsizes
+    elif provider == "ucsc":
         return UCSCClient(db).fetch_chromsizes(
             filter_chroms=filter_chroms,
             chrom_patterns=chrom_patterns,
@@ -90,7 +106,9 @@ def fetch_chromsizes(
         raise ValueError("Unknown provider '{}'".format(provider))
 
 
-def _origins_from_cytoband(cyb, band_col="gieStain"):
+def _origins_from_cytoband(
+    cyb: pd.DataFrame, band_col: str = "gieStain"
+) -> pd.DataFrame:
     """
     Extract chromosomal origin positions separating chromosome arms from 
     cytological band data. Takes the cytological origin, i.e. the boundary 
@@ -124,7 +142,7 @@ def _origins_from_cytoband(cyb, band_col="gieStain"):
     return pd.DataFrame.from_records(cens)
 
 
-def _origins_from_ucsccentromeres(cens):
+def _origins_from_ucsccentromeres(cens: pd.DataFrame) -> pd.DataFrame:
     """
     Extract chromosomal origin positions from UCSC centromeres.txt table 
     describing centromere model sequences. Takes the midpoint of all
@@ -153,49 +171,53 @@ def _origins_from_ucsccentromeres(cens):
     return cens
 
 
-def fetch_centromeres(db, provider=None, merge=True, verbose=False):
+def fetch_centromeres(db: str, provider: str = "local") -> pd.DataFrame:
     """
     Extract centromere locations for a given assembly 'db' from a variety
     of file formats in UCSC (cytoband, centromeres) depending on
     availability, returning a DataFrame.
 
     Parameters
     ----------
-
     db : str
-
-    merge : bool
-        Whether to merge all centromere intervals per chromosome into
-        one consolidated centromere interval.
-        Default True.
+        Assembly name.
+    provider : str, optional [default: "local"]
+        The provider of centromere data. Either "local" for local storage 
+        or "ucsc".
 
     Returns
     -------
     DataFrame with centromere 'chrom', 'start', 'end', 'mid'.
 
     Notes
     -----
-    The priority goes as
-    - Local (not implemented)
-    - centromeres.txt
-    - cytoBandIdeo
-    - cytoBand
+    When provider="local", centromeres are derived from cytoband tables
+    in local storage.
     
-    Gap files no longer provide centromere information.
+    Whe provider="ucsc", the fallback priority goes as follows:
+    - UCSC cytoBand
+    - UCSC cytoBandIdeo
+    - UCSC centromeres.txt
+    
+    Note that UCSC "gap" files no longer provide centromere information.
+
     Currently only works for human assemblies.
 
+    See also
+    --------
+    bioframe.assembly_info
+    bioframe.UCSCClient
     """
     if provider == "local":
-        raise NotImplementedError("local method not currently implemented")
-        fpath = f"data/{db}.centromeres"
-        if pkg_resources.resource_exists("bioframe.io", fpath):
-            return read_chromsizes(
-                pkg_resources.resource_filename("bioframe.io", fpath)
+        assembly = assembly_info(db)
+        cyb = assembly.cytobands
+        if cyb is None:
+            raise ValueError(
+                f"No source for centromere data found from provider '{provider}'."
             )
-        else:
-            raise LookupError(f"Centromeres for '{db}' not found in local storage")
+        return _origins_from_cytoband(cyb, band_col="stain")
 
-    if provider == "ucsc" or provider is None:
+    elif provider == "ucsc":
         client = UCSCClient(db)
         fetchers = [
             ("cytoband", client.fetch_cytoband),
@@ -210,31 +232,34 @@ def fetch_centromeres(db, provider=None, merge=True, verbose=False):
             except urllib.error.HTTPError:
                 pass
         else:
-            raise ValueError("No source for centromere data found.")
-    else:
-        raise NotImplementedError("currently UCSC is only implemented provider")
+            raise ValueError(
+               f"No source for centromere data found from provider '{provider}'."
+            )
+
+        if schema == "centromeres":
+            return _origins_from_ucsccentromeres(df)
+        else:
+            return _origins_from_cytoband(df)
 
-    if schema == "centromeres":
-        return _origins_from_ucsccentromeres(df)
     else:
-        return _origins_from_cytoband(df)
+        raise ValueError("Unknown provider '{}'".format(provider))
 
 
 class UCSCClient:
     BASE_URL = "http://hgdownload.cse.ucsc.edu/"
 
-    def __init__(self, db):
+    def __init__(self, db: str):
         self._db = db
         self._db_url = urljoin(self.BASE_URL, f"goldenPath/{db}/")
 
     def fetch_chromsizes(
         self,
-        filter_chroms=True,
-        chrom_patterns=(r"^chr[0-9]+$", r"^chr[XY]$", r"^chrM$"),
-        natsort=True,
-        as_bed=False,
+        filter_chroms: bool = True,
+        chrom_patterns: tuple = (r"^chr[0-9]+$", r"^chr[XY]$", r"^chrM$"),
+        natsort: bool = True,
+        as_bed: bool = False,
         **kwargs,
-    ):
+    ) -> Union[pd.Series, pd.DataFrame]:
         url = urljoin(self._db_url, f"bigZips/{self._db}.chrom.sizes")
         return read_chromsizes(
             url,
@@ -245,9 +270,9 @@ def fetch_chromsizes(
             **kwargs,
         )
 
-    def fetch_centromeres(self, **kwargs):
+    def fetch_centromeres(self, **kwargs) -> pd.DataFrame:
         url = urljoin(self._db_url, "database/centromeres.txt.gz")
-        return read_table(url, schema="centromeres")
+        return read_table(url, schema="centromeres", **kwargs)
 
     def fetch_gaps(self, **kwargs):
         url = urljoin(self._db_url, "database/gap.txt.gz")
@@ -258,19 +283,17 @@ def fetch_gaps(self, **kwargs):
             **kwargs,
         )
 
-    def fetch_cytoband(self, ideo=False, **kwargs):
+    def fetch_cytoband(self, ideo: bool = False, **kwargs) -> pd.DataFrame:
         if ideo:
             url = urljoin(self._db_url, "database/cytoBandIdeo.txt.gz")
         else:
             url = urljoin(self._db_url, "database/cytoBand.txt.gz")
         return read_table(url, schema="cytoband")
 
-    def fetch_mrna(self, **kwargs):
+    def fetch_mrna(self, **kwargs) -> pd.DataFrame:
         url = urljoin(self._db_url, "database/all_mrna.txt.gz")
         return read_table(
             url,
-            schema=UCSC_MRNA_FIELDS,
+            schema=SCHEMAS["all_mrna"],
             **kwargs,
         )
-
-
diff --git a/bioframe/io/schemas.py b/bioframe/io/schemas.py
@@ -192,7 +192,7 @@
     "vcf": VCF_FIELDS,
     "jaspar": JASPAR_FIELDS,
     "gap": GAP_FIELDS,
-    "UCSCmRNA": UCSC_MRNA_FIELDS,
+    "all_mrna": UCSC_MRNA_FIELDS,
     "pgsnp": PGSNP_FIELDS,
 }
 

diff --git a/tests/test_io.py → tests/test_fileops.py b/tests/test_io.py → tests/test_fileops.py
diff --git a/tests/test_resources.py b/tests/test_resources.py
@@ -0,0 +1,43 @@
+import pandas as pd
+import numpy as np
+import pytest
+
+import bioframe
+
+
+def test_fetch_chromsizes():
+    db = "hg38"
+    for provider in ["local", "ucsc"]:
+        chromsizes = bioframe.fetch_chromsizes(db, provider=provider)
+        assert isinstance(chromsizes, pd.Series)
+        assert chromsizes.name == "length"
+        assert len(chromsizes) == 25
+
+        chromsizes_df = bioframe.fetch_chromsizes(
+            db, provider=provider, as_bed=True
+        )
+        assert isinstance(chromsizes_df, pd.DataFrame)
+        assert list(chromsizes_df.columns) == ["chrom", "start", "end"]
+        assert len(chromsizes_df) == 25
+
+    # Check synonymous local assemblies
+    assert bioframe.fetch_chromsizes("hg38", provider="local").equals(
+        bioframe.fetch_chromsizes("GRCh38", provider="local")
+    )
+
+
+def test_fetch_chromsizes_local_vs_ucsc():
+    for db in ["hg19", "hg38", "mm9", "mm10"]:
+        assert bioframe.fetch_chromsizes(db, provider="local").equals(
+            bioframe.fetch_chromsizes(db, provider="ucsc")
+        )
+
+
+def test_fetch_centromeres():
+    for db in ["hg19", "hg38"]:
+        # Note: UCSC will usually have a different ordering of chromosomes
+        for provider in ["local", "ucsc"]:
+            centromeres = bioframe.fetch_centromeres(db, provider=provider)
+            assert isinstance(centromeres, pd.DataFrame)
+            assert list(centromeres.columns) == ["chrom", "start", "end", "mid"]
+            assert len(centromeres) == 24