Merge pull request #344 from KwanLab/release-2.2.1
Release 2.2.1
evanroyrees authored Aug 24, 2023
2 parents 737fa70 + 2363789 commit afd1abe
Showing 39 changed files with 364 additions and 277 deletions.
3 changes: 3 additions & 0 deletions .github/workflows/docker_autometa.yml
@@ -30,6 +30,8 @@ on:
branches:
- main
- dev
schedule:
- cron: '0 0 * * *' # every day at midnight

jobs:
docker_autometa:
@@ -50,6 +52,7 @@ jobs:
type=raw,value=latest,enable=${{ endsWith(github.ref, github.event.repository.default_branch) }}
type=raw,value={{branch}}
type=semver,pattern={{version}}
type=schedule,pattern=nightly
- name: Login to DockerHub
if: github.event_name != 'pull_request'
uses: docker/login-action@v1
4 changes: 4 additions & 0 deletions .pre-commit-config.yaml
@@ -11,3 +11,7 @@ repos:
- id: end-of-file-fixer
- id: debug-statements
- id: check-merge-conflict
- repo: https://github.com/hadialqattan/pycln
rev: v2.1.5 # Possible releases: https://github.com/hadialqattan/pycln/releases
hooks:
- id: pycln
2 changes: 2 additions & 0 deletions .readthedocs.yaml
@@ -20,3 +20,5 @@ formats: all
python:
install:
- requirements: docs/requirements.txt
- method: pip
path: .
9 changes: 5 additions & 4 deletions Dockerfile
@@ -1,4 +1,4 @@
FROM continuumio/miniconda3
FROM condaforge/mambaforge:latest
LABEL maintainer="jason.kwan@wisc.edu"

# Copyright 2022 Ian J. Miller, Evan R. Rees, Kyle Wolf, Siddharth Uppal,
@@ -25,11 +25,12 @@ RUN apt-get update --allow-releaseinfo-change \
&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*

COPY autometa-env.yml ./
RUN conda env update -n base --file=autometa-env.yml \
&& conda clean --all -y
RUN mamba env update -n base --file=autometa-env.yml \
&& mamba clean --all -y


COPY . .
COPY . /Autometa
WORKDIR /Autometa
RUN make install && make clean

# NOTE: DB_DIR must be an absolute path (not a relative path)
24 changes: 11 additions & 13 deletions Makefile
@@ -10,10 +10,10 @@ PYTHON_INTERPRETER = python3
# This was retrieved from https://drive.google.com/file/d/1bSlPldaq3C6Cf9Y5Rm7iwtUDcjxAaeEk/view?usp=sharing
TEST_DATA_FILEID = 1bSlPldaq3C6Cf9Y5Rm7iwtUDcjxAaeEk

ifeq (,$(shell which conda))
HAS_CONDA=False
ifeq (,$(shell which mamba))
HAS_MAMBA=False
else
HAS_CONDA=True
HAS_MAMBA=True
endif

#################################################################################
@@ -35,20 +35,18 @@ black:

## Set up python interpreter environment
create_environment: autometa-env.yml
ifeq (True,$(HAS_CONDA))
@echo ">>> Detected conda, creating conda environment."
ifeq (True,$(HAS_MAMBA))
@echo ">>> Detected mamba, creating mamba environment."
ifeq (3,$(findstring 3,$(PYTHON_INTERPRETER)))
conda env create --file=autometa-env.yml
mamba env create --file=autometa-env.yml
else
@echo "It looks like you are not using python 3. Autometa is only compatible with python 3. Please upgrade."
endif
@echo ">>> New conda env created. Activate with:\nsource activate $(PROJECT_NAME)"
@echo ">>> New mamba env created. Activate with:\nsource activate $(PROJECT_NAME)"
else
$(PYTHON_INTERPRETER) -m pip install -q virtualenv virtualenvwrapper
@echo ">>> Installing virtualenvwrapper if not already installed.\nMake sure the following lines are in shell startup file\n\
export WORKON_HOME=$$HOME/.virtualenvs\nexport PROJECT_HOME=$$HOME/Devel\nsource /usr/local/bin/virtualenvwrapper.sh\n"
@bash -c "source `which virtualenvwrapper.sh`;mkvirtualenv $(PROJECT_NAME) --python=$(PYTHON_INTERPRETER)"
@echo ">>> New virtualenv created. Activate with:\nworkon $(PROJECT_NAME)"
@echo "Mamba not detected. Please install before proceeding..."
@echo "Mamba docs: https://mamba.readthedocs.io/en/latest/"
exit
endif

#################################################################################
@@ -61,7 +59,7 @@ install: setup.py

## Install dependencies for test environment
test_environment: tests/environment.yml
conda env update -n $(PROJECT_NAME) --file=$<
mamba env update -n $(PROJECT_NAME) --file=$<

## Build docker image from Dockerfile (auto-taggged as jasonkwan/autometa:<current-branch>)
image: Dockerfile
2 changes: 1 addition & 1 deletion VERSION
@@ -1 +1 @@
2.2.0
2.2.1
6 changes: 2 additions & 4 deletions autometa-env.yml
@@ -10,9 +10,7 @@ dependencies:
- bowtie2
- diamond>=2.0
- gdown
- hdbscan
- hmmer
- joblib==1.1.0 # See https://stackoverflow.com/a/73830525/12671809
- numba>=0.47
- numpy>=1.13
- pandas>=1.1
@@ -24,8 +22,8 @@ dependencies:
- rsync
- samtools>=1.11
- scikit-bio
- scipy==1.8.1 #force scipy 1.8 until scikit-bio updates to 1.9, https://github.com/KwanLab/Autometa/issues/285
- scikit-learn==0.24 # prevent error from joblib in multiprocessing distance calculations
- scipy
- scikit-learn>=1.3
- seqkit
- tqdm
- trimap
52 changes: 28 additions & 24 deletions autometa/binning/large_data_mode.py
@@ -344,30 +344,34 @@ def cluster_by_taxon_partitioning(
binning_checkpoints_fpath = os.path.join(
cache, "binning_checkpoints.tsv.gz"
)
if binning_checkpoints_fpath:
if os.path.exists(binning_checkpoints_fpath) and os.path.getsize(binning_checkpoints_fpath):
checkpoint_info = get_checkpoint_info(binning_checkpoints_fpath)
binning_checkpoints = checkpoint_info["binning_checkpoints"]
starting_rank = checkpoint_info["starting_rank"]
starting_rank_name_txt = checkpoint_info["starting_rank_name_txt"]
# Update datastructures to begin at checkpoint stage.
## Forward fill binning annotations to most recent checkpoint and drop any contigs without bin annotations
most_recent_binning_checkpoint = (
binning_checkpoints.fillna(axis=1, method="ffill").iloc[:, -1].dropna()
)
clustered_contigs = set(
most_recent_binning_checkpoint.index.unique().tolist()
)
most_recent_clustered_df = most_recent_binning_checkpoint.to_frame().rename(
columns={starting_rank_name_txt: "cluster"}
)
num_clusters = most_recent_clustered_df.cluster.nunique()
clusters.append(most_recent_clustered_df)
else:
logger.debug(
f"Binning checkpoints not found. Writing checkpoints to {binning_checkpoints_fpath}"
)
binning_checkpoints = pd.DataFrame()
if (
binning_checkpoints_fpath
and os.path.exists(binning_checkpoints_fpath)
and os.path.getsize(binning_checkpoints_fpath)
):
checkpoint_info = get_checkpoint_info(binning_checkpoints_fpath)
binning_checkpoints = checkpoint_info["binning_checkpoints"]
starting_rank = checkpoint_info["starting_rank"]
starting_rank_name_txt = checkpoint_info["starting_rank_name_txt"]
# Update datastructures to begin at checkpoint stage.
## Forward fill binning annotations to most recent checkpoint and drop any contigs without bin annotations
most_recent_binning_checkpoint = (
binning_checkpoints.fillna(axis=1, method="ffill").iloc[:, -1].dropna()
)
clustered_contigs = set(most_recent_binning_checkpoint.index.unique().tolist())
most_recent_clustered_df = most_recent_binning_checkpoint.to_frame().rename(
columns={starting_rank_name_txt: "cluster"}
)
num_clusters = most_recent_clustered_df.cluster.nunique()
clusters.append(most_recent_clustered_df)
else:
logger_message = (
f"Binning checkpoints not found. Writing checkpoints to {binning_checkpoints_fpath}"
if binning_checkpoints_fpath
else "Binning checkpoints not found. Initializing..."
)
logger.debug(logger_message)
binning_checkpoints = pd.DataFrame()

# Subset ranks by provided (or checkpointed) starting rank
starting_rank_index = canonical_ranks.index(starting_rank)
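The large_data_mode.py hunk above collapses the nested checkpoint checks into a single guard and only restores state when a non-empty checkpoints file exists. Below is a minimal sketch of the recovery step it performs; the contig names, rank columns, and bin labels are hypothetical, and DataFrame.ffill(axis=1) stands in for the fillna(method="ffill") call shown in the diff.

import pandas as pd

# Hypothetical checkpoint table: one column per completed canonical rank, NaN = not yet binned
binning_checkpoints = pd.DataFrame(
    {"superkingdom": ["bin_1", "bin_2", None], "phylum": ["bin_1", None, None]},
    index=pd.Index(["contig_1", "contig_2", "contig_3"], name="contig"),
)

# Forward-fill annotations across ranks, keep the most recent (right-most) checkpoint,
# and drop contigs that never received a bin annotation
most_recent_binning_checkpoint = binning_checkpoints.ffill(axis=1).iloc[:, -1].dropna()

clustered_contigs = set(most_recent_binning_checkpoint.index)                  # {"contig_1", "contig_2"}
most_recent_clustered_df = most_recent_binning_checkpoint.rename("cluster").to_frame()
num_clusters = most_recent_clustered_df.cluster.nunique()                      # 2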
28 changes: 7 additions & 21 deletions autometa/binning/recursive_dbscan.py
@@ -16,8 +16,7 @@
import pandas as pd
import numpy as np

from sklearn.cluster import DBSCAN
from hdbscan import HDBSCAN
from sklearn.cluster import DBSCAN, HDBSCAN
from numba import config


@@ -235,8 +234,7 @@ def run_hdbscan(
df: pd.DataFrame,
min_cluster_size: int,
min_samples: int,
cache_dir: str = None,
core_dist_n_jobs: int = -1,
n_jobs: int = -1,
) -> pd.DataFrame:
"""Run clustering on `df` at provided `min_cluster_size`.
@@ -261,14 +259,9 @@ def run_hdbscan(
The number of samples in a neighborhood for a point to be
considered a core point.
cache_dir : str, optional
Used to cache the output of the computation of the tree.
By default, no caching is done. If a string is given, it is the
path to the caching directory.
core_dist_n_jobs: int
n_jobs: int
Number of parallel jobs to run in core distance computations.
For ``core_dist_n_jobs`` below -1, (n_cpus + 1 + core_dist_n_jobs) are used.
For ``n_jobs`` below -1, (n_cpus + 1 + n_jobs) are used.
Returns
-------
@@ -304,8 +297,7 @@ def run_hdbscan(
min_samples=min_samples,
cluster_selection_method="leaf",
allow_single_cluster=True,
memory=cache_dir,
core_dist_n_jobs=core_dist_n_jobs,
n_jobs=n_jobs,
).fit_predict(features_df.to_numpy())
clusters = pd.Series(clusters, index=df.index, name="cluster")
# NOTE: HDBSCAN labels outliers with -1
@@ -325,7 +317,7 @@ def recursive_hdbscan(
verbose: bool = False,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
"""Recursively run HDBSCAN starting with defaults and iterating the min_samples
and min_cluster_size until only 1 cluster is recovered.
and min_cluster_size until only 1 cluster is recovered.
Parameters
----------
@@ -372,14 +364,12 @@ def recursive_hdbscan(
n_clusters = float("inf")
best_median = float("-inf")
best_df = pd.DataFrame()
cache_dir = tempfile.mkdtemp()
while n_clusters > 1:
binned_df = run_hdbscan(
table,
min_cluster_size=min_cluster_size,
min_samples=min_samples,
cache_dir=cache_dir,
core_dist_n_jobs=n_jobs,
n_jobs=n_jobs,
)
df, metrics_df = add_metrics(df=binned_df, markers_df=markers_df)
filtered_df = apply_binning_metrics_filter(
@@ -403,8 +393,6 @@
)

if min_cluster_size >= max_min_cluster_size:
shutil.rmtree(cache_dir)
cache_dir = tempfile.mkdtemp()
min_samples += 1
min_cluster_size = 2
else:
@@ -416,8 +404,6 @@
if min_samples >= max_min_samples:
max_min_cluster_size *= 2

# clean up cache now that we are out of while loop
shutil.rmtree(cache_dir)
# Check our df is not empty from while loop
if best_df.empty:
if verbose:
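This recursive_dbscan.py change swaps the standalone hdbscan package for sklearn.cluster.HDBSCAN (available in scikit-learn >= 1.3), so the memory/cache_dir plumbing and core_dist_n_jobs argument give way to a plain n_jobs. Below is a sketch of the equivalent call, run on a small synthetic feature table rather than real contig embeddings.

import numpy as np
import pandas as pd
from sklearn.cluster import HDBSCAN  # requires scikit-learn >= 1.3

# Toy stand-in for the contig feature table (e.g. embedded k-mer frequencies + coverage)
rng = np.random.default_rng(42)
features_df = pd.DataFrame(
    rng.normal(size=(100, 2)),
    columns=["x_1", "x_2"],
    index=[f"contig_{i}" for i in range(100)],
)

clusterer = HDBSCAN(
    min_cluster_size=2,
    min_samples=1,
    cluster_selection_method="leaf",
    allow_single_cluster=True,
    n_jobs=-1,  # replaces hdbscan's core_dist_n_jobs; sklearn's version has no memory/cache_dir
)
labels = clusterer.fit_predict(features_df.to_numpy())
clusters = pd.Series(labels, index=features_df.index, name="cluster")
# NOTE: HDBSCAN labels outliers with -1
print(clusters.value_counts().head())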
10 changes: 7 additions & 3 deletions autometa/binning/unclustered_recruitment.py
@@ -407,9 +407,13 @@ def get_confidence_filtered_predictions(
# Filter predictions by confidence threshold
confidence_threshold = num_classifications * confidence
df = df[df.max(axis="columns") >= confidence_threshold]
filtered_predictions = df.idxmax(axis="columns")
filtered_predictions.name = "cluster"
return filtered_predictions.to_frame()
if df.empty:
filtered_predictions = pd.DataFrame(
[], columns=["contig", "cluster"]
).set_index("contig")
else:
filtered_predictions = df.idxmax(axis="columns").to_frame(name="cluster")
return filtered_predictions


def filter_contaminating_predictions(
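The unclustered_recruitment.py hunk guards against calling idxmax on an empty frame once the confidence filter removes every row. A small stand-alone sketch of that guarded step follows, using a hypothetical vote table; the helper name and example numbers are illustrative, not part of the module.

import pandas as pd

def confidence_filtered_clusters(df: pd.DataFrame, num_classifications: int, confidence: float) -> pd.DataFrame:
    """Keep rows whose best vote count clears the confidence threshold, then take the winning cluster per contig."""
    confidence_threshold = num_classifications * confidence
    df = df[df.max(axis="columns") >= confidence_threshold]
    if df.empty:
        # Return an empty, correctly-shaped frame instead of calling idxmax on nothing
        return pd.DataFrame([], columns=["contig", "cluster"]).set_index("contig")
    return df.idxmax(axis="columns").to_frame(name="cluster")

# Hypothetical vote table: rows are contigs, columns are candidate clusters
votes = pd.DataFrame(
    {"bin_1": [8, 1], "bin_2": [2, 9]},
    index=pd.Index(["contig_1", "contig_2"], name="contig"),
)
print(confidence_filtered_clusters(votes, num_classifications=10, confidence=1.0))  # empty frame
print(confidence_filtered_clusters(votes, num_classifications=10, confidence=0.8))  # both contigs kept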
26 changes: 15 additions & 11 deletions autometa/taxonomy/database.py
@@ -31,7 +31,7 @@ class TaxonomyDatabase(ABC):
class GTDB(TaxonomyDatabase):
def __init__(self, ...):
self.nodes = self.parse_nodes()
self.names = self.parse_names()
self.names = self.parse_names()
self.merged = self.parse_merged()
self.delnodes = self.parse_delnodes()
...
@@ -59,6 +59,7 @@ def convert_accessions_to_taxids(self, accessions):
Available attributes:
CANONICAL_RANKS
UNCLASSIFIED
"""

CANONICAL_RANKS = [
@@ -71,6 +72,7 @@ def convert_accessions_to_taxids(self, accessions):
"superkingdom",
"root",
]
UNCLASSIFIED = "unclassified"

@abstractmethod
def parse_nodes(self) -> Dict[int, Dict[str, Union[str, int]]]:
@@ -100,7 +102,7 @@ def parse_names(self) -> Dict[int, str]:
Returns
-------
str
Name of provided `taxid` if `taxid` is found in names.dmp else 'unclassified'
Name of provided `taxid` if `taxid` is found in names.dmp else TaxonomyDatabase.UNCLASSIFIED
"""

@@ -237,7 +239,7 @@ def name(self, taxid: int, rank: str = None) -> str:
Returns
-------
str
Name of provided `taxid` if `taxid` is found in names.dmp else 'unclassified'
Name of provided `taxid` if `taxid` is found in names.dmp else TaxonomyDatabase.UNCLASSIFIED
"""
try:
@@ -246,19 +248,19 @@ def name(self, taxid: int, rank: str = None) -> str:
logger.warning(err)
taxid = 0
if not rank:
return self.names.get(taxid, "unclassified")
return self.names.get(taxid, TaxonomyDatabase.UNCLASSIFIED)
if rank not in set(TaxonomyDatabase.CANONICAL_RANKS):
logger.warning(f"{rank} not in canonical ranks!")
return "unclassified"
return TaxonomyDatabase.UNCLASSIFIED
ancestor_taxid = taxid
while ancestor_taxid != 1:
ancestor_rank = self.rank(ancestor_taxid)
if ancestor_rank == rank:
return self.names.get(ancestor_taxid, "unclassified")
return self.names.get(ancestor_taxid, TaxonomyDatabase.UNCLASSIFIED)
ancestor_taxid = self.parent(ancestor_taxid)
# At this point we have not encountered a name for the taxid rank
# so we will place this as unclassified.
return "unclassified"
return TaxonomyDatabase.UNCLASSIFIED

def rank(self, taxid: int) -> str:
"""
@@ -272,15 +274,17 @@ def rank(self, taxid: int) -> str:
Returns
-------
str
rank name if taxid is found in nodes else "unclassified"
rank name if taxid is found in nodes else autoattribute:: autometa.taxonomy.database.TaxonomyDatabase.UNCLASSIFIED
"""
try:
taxid = self.convert_taxid_dtype(taxid)
except DatabaseOutOfSyncError as err:
logger.warning(err)
taxid = 0
return self.nodes.get(taxid, {"rank": "unclassified"}).get("rank")
return self.nodes.get(taxid, {"rank": TaxonomyDatabase.UNCLASSIFIED}).get(
"rank"
)

def parent(self, taxid: int) -> int:
"""
Expand Down Expand Up @@ -368,7 +372,7 @@ def get_lineage_dataframe(
taxids : iterable
`taxids` whose lineage dataframe is being returned
fillna : bool, optional
Whether to fill the empty cells with 'unclassified' or not, default True
Whether to fill the empty cells with TaxonomyDatabase.UNCLASSIFIED or not, default True
Returns
-------
Expand Down Expand Up @@ -408,5 +412,5 @@ def get_lineage_dataframe(
df = pd.DataFrame(ranked_taxids).transpose()
df.index.name = "taxid"
if fillna:
df.fillna(value="unclassified", inplace=True)
df.fillna(value=TaxonomyDatabase.UNCLASSIFIED, inplace=True)
return df
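The database.py hunks replace the repeated 'unclassified' string literal with a single TaxonomyDatabase.UNCLASSIFIED class attribute used by name(), rank(), and get_lineage_dataframe(). A trimmed-down sketch of that pattern follows; the ToyDatabase subclass and its two taxid entries are hypothetical stand-ins for the real names.dmp/nodes.dmp parsers.

from abc import ABC
from typing import Dict

class TaxonomyDatabase(ABC):
    """Trimmed-down sketch of the shared constants and the missing-taxid fallback."""
    CANONICAL_RANKS = ["species", "genus", "family", "order", "class", "phylum", "superkingdom", "root"]
    UNCLASSIFIED = "unclassified"

class ToyDatabase(TaxonomyDatabase):
    # Hypothetical in-memory stand-in for names.dmp / nodes.dmp
    def __init__(self):
        self.names: Dict[int, str] = {1: "root", 562: "Escherichia coli"}
        self.nodes: Dict[int, Dict[str, str]] = {1: {"rank": "root"}, 562: {"rank": "species"}}

    def name(self, taxid: int) -> str:
        return self.names.get(taxid, TaxonomyDatabase.UNCLASSIFIED)

    def rank(self, taxid: int) -> str:
        return self.nodes.get(taxid, {"rank": TaxonomyDatabase.UNCLASSIFIED}).get("rank")

db = ToyDatabase()
print(db.name(562))   # Escherichia coli
print(db.name(999))   # unclassified (missing taxid falls back to the class constant)
print(db.rank(999))   # unclassified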