Commit

Merge branch 'dev' of github.com:KwanLab/Autometa into issue-#289

evanroyrees committed Dec 16, 2022
2 parents 646a146 + 9772257 commit ad0efd0
Showing 30 changed files with 1,763 additions and 599 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/pytest_codecov.yml
@@ -26,7 +26,7 @@ jobs:
strategy:
matrix:
os: [ubuntu-latest]
python-version: [3.7]
python-version: [3.8]
env:
OS: ${{ matrix.os }}
PYTHON: ${{ matrix.python-version }}
4 changes: 3 additions & 1 deletion autometa-env.yml
@@ -12,6 +12,7 @@ dependencies:
- gdown
- hdbscan
- hmmer
- joblib==1.1.0 # See https://stackoverflow.com/a/73830525/12671809
- numba>=0.47
- numpy>=1.13
- pandas>=1.1
@@ -23,8 +24,9 @@ dependencies:
- rsync
- samtools>=1.11
- scikit-bio
- scipy==1.8 #force scipy 1.8 until scikit-bio updates to 1.9, https://github.com/KwanLab/Autometa/issues/285
- scipy==1.8.1 #force scipy 1.8 until scikit-bio updates to 1.9, https://github.com/KwanLab/Autometa/issues/285
- scikit-learn==0.24 # prevent error from joblib in multiprocessing distance calculations
- seqkit
- tqdm
- trimap
- tsne
35 changes: 23 additions & 12 deletions autometa/binning/large_data_mode.py
@@ -39,7 +39,7 @@
from autometa.common import kmers

from autometa.common.exceptions import TableFormatError, BinningError
from autometa.taxonomy.ncbi import NCBI
from autometa.taxonomy.database import TaxonomyDatabase
from autometa.binning.recursive_dbscan import get_clusters
from autometa.binning.utilities import (
write_results,
@@ -306,17 +306,26 @@ def cluster_by_taxon_partitioning(
Raises
-------
TableFormatError
No marker information is availble for contigs to be binned.
No marker information is available for contigs to be binned.
FileNotFoundError
Provided `binning_checkpoints_fpath` does not exist
TableFormatError
No marker information is availble for contigs to be binned.
"""
if binning_checkpoints_fpath and not os.path.exists(binning_checkpoints_fpath):
raise FileNotFoundError(binning_checkpoints_fpath)

if reverse_ranks:
# species, genus, family, order, class, phylum, superkingdom
canonical_ranks = [rank for rank in NCBI.CANONICAL_RANKS if rank != "root"]
canonical_ranks = [
rank for rank in TaxonomyDatabase.CANONICAL_RANKS if rank != "root"
]
else:
# superkingdom, phylum, class, order, family, genus, species
canonical_ranks = [
rank for rank in reversed(NCBI.CANONICAL_RANKS) if rank != "root"
rank
for rank in reversed(TaxonomyDatabase.CANONICAL_RANKS)
if rank != "root"
]
# if stage is cached then we can first look to the cache before we begin subsetting main...
clustered_contigs = set()
@@ -326,15 +335,17 @@ def cluster_by_taxon_partitioning(
starting_rank_name_txt = None
# Retrieve appropriate starting canonical rank and rank_name_txt from cached binning checkpoints if cache was provided
if cache:
if binning_checkpoints_fpath and not os.path.exists(binning_checkpoints_fpath):
raise FileNotFoundError(binning_checkpoints_fpath)
if not os.path.exists(cache):
os.makedirs(os.path.realpath(cache), exist_ok=True)
logger.debug(f"Created cache dir: {cache}")
if not os.path.isdir(cache):
raise NotADirectoryError(cache)
if not binning_checkpoints_fpath:
binning_checkpoints_fpath = os.path.join(
cache, "binning_checkpoints.tsv.gz"
)
if os.path.exists(binning_checkpoints_fpath) and os.path.getsize(
binning_checkpoints_fpath
):
if binning_checkpoints_fpath:
if os.path.exists(binning_checkpoints_fpath) and os.path.getsize(binning_checkpoints_fpath):
checkpoint_info = get_checkpoint_info(binning_checkpoints_fpath)
binning_checkpoints = checkpoint_info["binning_checkpoints"]
starting_rank = checkpoint_info["starting_rank"]
@@ -392,9 +403,9 @@ def cluster_by_taxon_partitioning(
else None
)
# Create canonical rank cache outdir if it does not exist
rank_cache_outdir = os.path.join(cache, canonical_rank)
rank_cache_outdir = os.path.join(cache, canonical_rank) if cache else None
if embedding_cache_fpath and not os.path.isdir(rank_cache_outdir):
os.makedirs(rank_cache_outdir)
os.makedirs(rank_cache_outdir, exist_ok=True)
rank_embedding = get_kmer_embedding(
rank_counts,
cache_fpath=embedding_cache_fpath,
@@ -546,7 +557,7 @@ def cluster_by_taxon_partitioning(
num_clusters += clustered.cluster.nunique()
clusters.append(clustered)
# Cache binning at rank_name_txt stage (rank-name-txt checkpointing)
if cache:
if binning_checkpoints_fpath:
binning_checkpoints = checkpoint(
checkpoints_df=binning_checkpoints,
clustered=clustered,
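
Taken together, the hunks above make both the cache directory and the checkpoints file optional. A minimal sketch of the resolution logic, distilled from this diff (the helper name resolve_checkpoints_path is hypothetical and not part of the commit):

import os

def resolve_checkpoints_path(cache: str = None, binning_checkpoints_fpath: str = None) -> str:
    """Distilled sketch of the cache/checkpoint handling shown in the hunks above."""
    # Fail fast when a user-supplied checkpoints file does not exist.
    if binning_checkpoints_fpath and not os.path.exists(binning_checkpoints_fpath):
        raise FileNotFoundError(binning_checkpoints_fpath)
    if cache:
        # Create the cache directory on demand instead of requiring it upfront.
        os.makedirs(os.path.realpath(cache), exist_ok=True)
        # Default the checkpoints file into the cache directory.
        if not binning_checkpoints_fpath:
            binning_checkpoints_fpath = os.path.join(cache, "binning_checkpoints.tsv.gz")
    return binning_checkpoints_fpath  # may be None if neither argument was provided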
10 changes: 7 additions & 3 deletions autometa/binning/recursive_dbscan.py
@@ -24,7 +24,7 @@
from autometa.common.markers import load as load_markers

from autometa.common.exceptions import TableFormatError, BinningError
from autometa.taxonomy.ncbi import NCBI
from autometa.taxonomy.database import TaxonomyDatabase
from autometa.binning.utilities import (
write_results,
read_annotations,
@@ -628,10 +628,14 @@ def taxon_guided_binning(
logger.info(f"Using {method} clustering method")
if reverse_ranks:
# species, genus, family, order, class, phylum, superkingdom
ranks = [rank for rank in NCBI.CANONICAL_RANKS if rank != "root"]
ranks = [rank for rank in TaxonomyDatabase.CANONICAL_RANKS if rank != "root"]
else:
# superkingdom, phylum, class, order, family, genus, species
ranks = [rank for rank in reversed(NCBI.CANONICAL_RANKS) if rank != "root"]
ranks = [
rank
for rank in reversed(TaxonomyDatabase.CANONICAL_RANKS)
if rank != "root"
]
starting_rank_index = ranks.index(starting_rank)
ranks = ranks[starting_rank_index:]
logger.debug(f"Using ranks: {', '.join(ranks)}")
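
Both taxon_guided_binning here and cluster_by_taxon_partitioning above now derive their rank ordering from TaxonomyDatabase.CANONICAL_RANKS rather than NCBI.CANONICAL_RANKS. For illustration (assuming, as the inline comments state, that the constant is ordered from species up to root):

from autometa.taxonomy.database import TaxonomyDatabase

# reverse_ranks=True: species, genus, family, order, class, phylum, superkingdom
low_to_high = [rank for rank in TaxonomyDatabase.CANONICAL_RANKS if rank != "root"]

# default: superkingdom, phylum, class, order, family, genus, species
high_to_low = [rank for rank in reversed(TaxonomyDatabase.CANONICAL_RANKS) if rank != "root"]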
40 changes: 28 additions & 12 deletions autometa/binning/summary.py
@@ -17,7 +17,9 @@

from Bio import SeqIO

from autometa.taxonomy.database import TaxonomyDatabase
from autometa.taxonomy.ncbi import NCBI
from autometa.taxonomy.gtdb import GTDB
from autometa.taxonomy import majority_vote
from autometa.common.markers import load as load_markers

@@ -226,16 +228,16 @@ def get_metabin_stats(


def get_metabin_taxonomies(
bin_df: pd.DataFrame, ncbi: NCBI, cluster_col: str = "cluster"
bin_df: pd.DataFrame, taxa_db: TaxonomyDatabase, cluster_col: str = "cluster"
) -> pd.DataFrame:
"""Retrieve taxonomies of all clusters recovered from Autometa binning.
Parameters
----------
bin_df : pd.DataFrame
Autometa binning table. index=contig, cols=['cluster','length','taxid', *canonical_ranks]
ncbi : autometa.taxonomy.ncbi.NCBI instance
Autometa NCBI class instance
taxa_db : autometa.taxonomy.ncbi.TaxonomyDatabase instance
Autometa NCBI or GTDB class instance
cluster_col : str, optional
Clustering column by which to group metabins
@@ -246,7 +248,9 @@ def get_metabin_taxonomies(
Indexed by cluster
"""
logger.info(f"Retrieving metabin taxonomies for {cluster_col}")
canonical_ranks = [rank for rank in NCBI.CANONICAL_RANKS if rank != "root"]
canonical_ranks = [
rank for rank in TaxonomyDatabase.CANONICAL_RANKS if rank != "root"
]
is_clustered = bin_df[cluster_col].notnull()
bin_df = bin_df[is_clustered]
outcols = [cluster_col, "length", "taxid", *canonical_ranks]
@@ -277,11 +281,13 @@ def get_metabin_taxonomies(
taxonomies[cluster][canonical_rank].update({taxid: length})
else:
taxonomies[cluster][canonical_rank][taxid] += length
cluster_taxonomies = majority_vote.rank_taxids(taxonomies, ncbi)
cluster_taxonomies = majority_vote.rank_taxids(taxonomies, taxa_db=taxa_db)
# With our cluster taxonomies, let's place these into a dataframe for easy data accession
cluster_taxa_df = pd.Series(data=cluster_taxonomies, name="taxid").to_frame()
# With the list of taxids, we'll retrieve their complete canonical-rank information
lineage_df = ncbi.get_lineage_dataframe(cluster_taxa_df.taxid.tolist(), fillna=True)
lineage_df = taxa_db.get_lineage_dataframe(
cluster_taxa_df.taxid.tolist(), fillna=True
)
# Now put it all together
cluster_taxa_df = pd.merge(
cluster_taxa_df, lineage_df, how="left", left_on="taxid", right_index=True
@@ -323,11 +329,18 @@ def main():
required=True,
)
parser.add_argument(
"--ncbi",
help="Path to user NCBI databases directory (Required for retrieving metabin taxonomies)",
"--dbdir",
help="Path to user taxonomy database directory (Required for retrieving metabin taxonomies)",
metavar="dirpath",
required=False,
)
parser.add_argument(
"--dbtype",
help="Taxonomy database type to use (NOTE: must correspond to the same database type used during contig taxon assignment.)",
choices=["ncbi", "gtdb"],
required=False,
default="ncbi",
)
parser.add_argument(
"--binning-column",
help="Binning column to use for grouping metabins",
@@ -377,14 +390,17 @@ def main():
logger.info(f"Wrote metabin stats to {args.output_stats}")
# Finally if taxonomy information is available then write out each metabin's taxonomy by modified majority voting method.
if "taxid" in bin_df.columns:
if not args.ncbi:
if not args.dbdir:
logger.warn(
"taxid found in dataframe. --ncbi argument is required to retrieve metabin taxonomies. Skipping..."
"taxid found in dataframe. --dbdir argument is required to retrieve metabin taxonomies. Skipping..."
)
else:
ncbi = NCBI(dirpath=args.ncbi)
if args.dbtype == "ncbi":
taxa_db = NCBI(dbdir=args.dbdir)
elif args.dbtype == "gtdb":
taxa_db = GTDB(dbdir=args.dbdir)
taxa_df = get_metabin_taxonomies(
bin_df=bin_df, ncbi=ncbi, cluster_col=args.binning_column
bin_df=bin_df, taxa_db=taxa_db, cluster_col=args.binning_column
)
taxa_df.to_csv(args.output_taxonomy, sep="\t", index=True, header=True)

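A short usage sketch of the new database selection, mirroring the argparse handling above; the file paths are placeholders, while the keyword names (dbdir, taxa_db, cluster_col) come from the diff:

import pandas as pd

from autometa.taxonomy.ncbi import NCBI
from autometa.taxonomy.gtdb import GTDB
from autometa.binning.summary import get_metabin_taxonomies

dbtype = "gtdb"  # must match the database type used during contig taxon assignment
dbdir = "/path/to/taxonomy/databases"  # placeholder directory
taxa_db = NCBI(dbdir=dbdir) if dbtype == "ncbi" else GTDB(dbdir=dbdir)

# Placeholder binning table: index=contig, cols=['cluster', 'length', 'taxid', *canonical_ranks]
bin_df = pd.read_csv("binning_main.tsv", sep="\t", index_col="contig")
taxa_df = get_metabin_taxonomies(bin_df=bin_df, taxa_db=taxa_db, cluster_col="cluster")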
10 changes: 7 additions & 3 deletions autometa/binning/utilities.py
@@ -33,7 +33,7 @@

from typing import Iterable, Tuple

from autometa.taxonomy.ncbi import NCBI
from autometa.taxonomy.database import TaxonomyDatabase


logger = logging.getLogger(__name__)
@@ -98,7 +98,7 @@ def filter_taxonomy(df: pd.DataFrame, rank: str, name: str) -> pd.DataFrame:
Provided `name` not found in `rank` column.
"""
# First clean the assigned taxa by broadcasting lowercase and replacing any whitespace with underscores
for canonical_rank in NCBI.CANONICAL_RANKS:
for canonical_rank in TaxonomyDatabase.CANONICAL_RANKS:
if canonical_rank not in df.columns:
continue
df[canonical_rank] = df[canonical_rank].map(
@@ -395,7 +395,11 @@ def write_results(
outcols.extend(annotation_cols)
# Add in taxonomy columns if taxa are present
# superkingdom, phylum, class, order, family, genus, species
taxa_cols = [rank for rank in reversed(NCBI.CANONICAL_RANKS) if rank != "root"]
taxa_cols = [
rank
for rank in reversed(TaxonomyDatabase.CANONICAL_RANKS)
if rank != "root"
]
taxa_cols.append("taxid")
# superkingdom, phylum, class, order, family, genus, species, taxid
for taxa_col in taxa_cols:
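
The filter_taxonomy hunk above normalizes assigned taxa names before filtering. The exact mapping is collapsed in this view, but the described behavior (lowercase, whitespace replaced with underscores) amounts to something like the following hypothetical snippet:

import pandas as pd

names = pd.Series(["Candidatus Accumulibacter", "Escherichia coli"])
cleaned = names.map(lambda name: name.lower().replace(" ", "_"))
# -> candidatus_accumulibacter, escherichia_coli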
19 changes: 19 additions & 0 deletions autometa/common/utilities.py
@@ -27,6 +27,25 @@
logger = logging.getLogger(__name__)


def is_gz_file(filepath) -> bool:
"""
Check if the given file is gzipped compressed or not.
Parameters
----------
filepath : str
Filepath to check
Returns
-------
bool
True if file is gzipped else False
"""
# https://stackoverflow.com/a/47080739
with open(filepath, "rb") as test_f:
return test_f.read(2) == b"\x1f\x8b"


def unpickle(fpath: str) -> Any:
"""Load a serialized `fpath` from :func:`make_pickle`.
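
A usage sketch for the newly added is_gz_file helper; this is not part of the commit, and the gzip handling shown is an assumption about typical use:

import gzip

from autometa.common.utilities import is_gz_file

fpath = "metagenome.fasta.gz"  # placeholder path
# Open transparently whether or not the file is gzip-compressed.
handle = gzip.open(fpath, "rt") if is_gz_file(fpath) else open(fpath)
num_lines = sum(1 for _ in handle)
handle.close()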