Merge pull request #344 from KwanLab/release-2.2.1
Release 2.2.1
evanroyrees authored Aug 24, 2023
2 parents 737fa70 + 2363789 commit afd1abe
Showing 39 changed files with 364 additions and 277 deletions.
3 changes: 3 additions & 0 deletions .github/workflows/docker_autometa.yml
@@ -30,6 +30,8 @@ on:
branches:
- main
- dev
schedule:
- cron: '0 0 * * *' # every day at midnight

jobs:
docker_autometa:
@@ -50,6 +52,7 @@ jobs:
type=raw,value=latest,enable=${{ endsWith(github.ref, github.event.repository.default_branch) }}
type=raw,value={{branch}}
type=semver,pattern={{version}}
type=schedule,pattern=nightly
- name: Login to DockerHub
if: github.event_name != 'pull_request'
uses: docker/login-action@v1
4 changes: 4 additions & 0 deletions .pre-commit-config.yaml
@@ -11,3 +11,7 @@ repos:
- id: end-of-file-fixer
- id: debug-statements
- id: check-merge-conflict
- repo: https://github.com/hadialqattan/pycln
rev: v2.1.5 # Possible releases: https://github.com/hadialqattan/pycln/releases
hooks:
- id: pycln
2 changes: 2 additions & 0 deletions .readthedocs.yaml
@@ -20,3 +20,5 @@ formats: all
python:
install:
- requirements: docs/requirements.txt
- method: pip
path: .
9 changes: 5 additions & 4 deletions Dockerfile
@@ -1,4 +1,4 @@
FROM continuumio/miniconda3
FROM condaforge/mambaforge:latest
LABEL maintainer="jason.kwan@wisc.edu"

# Copyright 2022 Ian J. Miller, Evan R. Rees, Kyle Wolf, Siddharth Uppal,
@@ -25,11 +25,12 @@ RUN apt-get update --allow-releaseinfo-change \
&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*

COPY autometa-env.yml ./
RUN conda env update -n base --file=autometa-env.yml \
&& conda clean --all -y
RUN mamba env update -n base --file=autometa-env.yml \
&& mamba clean --all -y


COPY . .
COPY . /Autometa
WORKDIR /Autometa
RUN make install && make clean

# NOTE: DB_DIR must be an absolute path (not a relative path)
24 changes: 11 additions & 13 deletions Makefile
@@ -10,10 +10,10 @@ PYTHON_INTERPRETER = python3
# This was retrieved from https://drive.google.com/file/d/1bSlPldaq3C6Cf9Y5Rm7iwtUDcjxAaeEk/view?usp=sharing
TEST_DATA_FILEID = 1bSlPldaq3C6Cf9Y5Rm7iwtUDcjxAaeEk

ifeq (,$(shell which conda))
HAS_CONDA=False
ifeq (,$(shell which mamba))
HAS_MAMBA=False
else
HAS_CONDA=True
HAS_MAMBA=True
endif

#################################################################################
@@ -35,20 +35,18 @@ black:

## Set up python interpreter environment
create_environment: autometa-env.yml
ifeq (True,$(HAS_CONDA))
@echo ">>> Detected conda, creating conda environment."
ifeq (True,$(HAS_MAMBA))
@echo ">>> Detected mamba, creating mamba environment."
ifeq (3,$(findstring 3,$(PYTHON_INTERPRETER)))
conda env create --file=autometa-env.yml
mamba env create --file=autometa-env.yml
else
@echo "It looks like you are not using python 3. Autometa is only compatible with python 3. Please upgrade."
endif
@echo ">>> New conda env created. Activate with:\nsource activate $(PROJECT_NAME)"
@echo ">>> New mamba env created. Activate with:\nsource activate $(PROJECT_NAME)"
else
$(PYTHON_INTERPRETER) -m pip install -q virtualenv virtualenvwrapper
@echo ">>> Installing virtualenvwrapper if not already installed.\nMake sure the following lines are in shell startup file\n\
export WORKON_HOME=$$HOME/.virtualenvs\nexport PROJECT_HOME=$$HOME/Devel\nsource /usr/local/bin/virtualenvwrapper.sh\n"
@bash -c "source `which virtualenvwrapper.sh`;mkvirtualenv $(PROJECT_NAME) --python=$(PYTHON_INTERPRETER)"
@echo ">>> New virtualenv created. Activate with:\nworkon $(PROJECT_NAME)"
@echo "Mamba not detected. Please install before proceeding..."
@echo "Mamba docs: https://mamba.readthedocs.io/en/latest/"
exit
endif

#################################################################################
@@ -61,7 +59,7 @@ install: setup.py

## Install dependencies for test environment
test_environment: tests/environment.yml
conda env update -n $(PROJECT_NAME) --file=$<
mamba env update -n $(PROJECT_NAME) --file=$<

## Build docker image from Dockerfile (auto-taggged as jasonkwan/autometa:<current-branch>)
image: Dockerfile
2 changes: 1 addition & 1 deletion VERSION
@@ -1 +1 @@
2.2.0
2.2.1
6 changes: 2 additions & 4 deletions autometa-env.yml
@@ -10,9 +10,7 @@ dependencies:
- bowtie2
- diamond>=2.0
- gdown
- hdbscan
- hmmer
- joblib==1.1.0 # See https://stackoverflow.com/a/73830525/12671809
- numba>=0.47
- numpy>=1.13
- pandas>=1.1
@@ -24,8 +22,8 @@ dependencies:
- rsync
- samtools>=1.11
- scikit-bio
- scipy==1.8.1 #force scipy 1.8 until scikit-bio updates to 1.9, https://github.com/KwanLab/Autometa/issues/285
- scikit-learn==0.24 # prevent error from joblib in multiprocessing distance calculations
- scipy
- scikit-learn>=1.3
- seqkit
- tqdm
- trimap
52 changes: 28 additions & 24 deletions autometa/binning/large_data_mode.py
@@ -344,30 +344,34 @@ def cluster_by_taxon_partitioning(
binning_checkpoints_fpath = os.path.join(
cache, "binning_checkpoints.tsv.gz"
)
if binning_checkpoints_fpath:
if os.path.exists(binning_checkpoints_fpath) and os.path.getsize(binning_checkpoints_fpath):
checkpoint_info = get_checkpoint_info(binning_checkpoints_fpath)
binning_checkpoints = checkpoint_info["binning_checkpoints"]
starting_rank = checkpoint_info["starting_rank"]
starting_rank_name_txt = checkpoint_info["starting_rank_name_txt"]
# Update datastructures to begin at checkpoint stage.
## Forward fill binning annotations to most recent checkpoint and drop any contigs without bin annotations
most_recent_binning_checkpoint = (
binning_checkpoints.fillna(axis=1, method="ffill").iloc[:, -1].dropna()
)
clustered_contigs = set(
most_recent_binning_checkpoint.index.unique().tolist()
)
most_recent_clustered_df = most_recent_binning_checkpoint.to_frame().rename(
columns={starting_rank_name_txt: "cluster"}
)
num_clusters = most_recent_clustered_df.cluster.nunique()
clusters.append(most_recent_clustered_df)
else:
logger.debug(
f"Binning checkpoints not found. Writing checkpoints to {binning_checkpoints_fpath}"
)
binning_checkpoints = pd.DataFrame()
if (
binning_checkpoints_fpath
and os.path.exists(binning_checkpoints_fpath)
and os.path.getsize(binning_checkpoints_fpath)
):
checkpoint_info = get_checkpoint_info(binning_checkpoints_fpath)
binning_checkpoints = checkpoint_info["binning_checkpoints"]
starting_rank = checkpoint_info["starting_rank"]
starting_rank_name_txt = checkpoint_info["starting_rank_name_txt"]
# Update datastructures to begin at checkpoint stage.
## Forward fill binning annotations to most recent checkpoint and drop any contigs without bin annotations
most_recent_binning_checkpoint = (
binning_checkpoints.fillna(axis=1, method="ffill").iloc[:, -1].dropna()
)
clustered_contigs = set(most_recent_binning_checkpoint.index.unique().tolist())
most_recent_clustered_df = most_recent_binning_checkpoint.to_frame().rename(
columns={starting_rank_name_txt: "cluster"}
)
num_clusters = most_recent_clustered_df.cluster.nunique()
clusters.append(most_recent_clustered_df)
else:
logger_message = (
f"Binning checkpoints not found. Writing checkpoints to {binning_checkpoints_fpath}"
if binning_checkpoints_fpath
else "Binning checkpoints not found. Initializing..."
)
logger.debug(logger_message)
binning_checkpoints = pd.DataFrame()

# Subset ranks by provided (or checkpointed) starting rank
starting_rank_index = canonical_ranks.index(starting_rank)
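The large_data_mode.py hunk above collapses the nested checkpoint checks into a single guard and only restores state when a non-empty checkpoints file exists. Below is a minimal sketch of the recovery step it performs; the contig names, rank columns, and bin labels are hypothetical, and DataFrame.ffill(axis=1) stands in for the fillna(method="ffill") call shown in the diff.

import pandas as pd

# Hypothetical checkpoint table: one column per completed canonical rank, NaN = not yet binned
binning_checkpoints = pd.DataFrame(
    {"superkingdom": ["bin_1", "bin_2", None], "phylum": ["bin_1", None, None]},
    index=pd.Index(["contig_1", "contig_2", "contig_3"], name="contig"),
)

# Forward-fill annotations across ranks, keep the most recent (right-most) checkpoint,
# and drop contigs that never received a bin annotation
most_recent_binning_checkpoint = binning_checkpoints.ffill(axis=1).iloc[:, -1].dropna()

clustered_contigs = set(most_recent_binning_checkpoint.index)                  # {"contig_1", "contig_2"}
most_recent_clustered_df = most_recent_binning_checkpoint.rename("cluster").to_frame()
num_clusters = most_recent_clustered_df.cluster.nunique()                      # 2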
28 changes: 7 additions & 21 deletions autometa/binning/recursive_dbscan.py
@@ -16,8 +16,7 @@
import pandas as pd
import numpy as np

from sklearn.cluster import DBSCAN
from hdbscan import HDBSCAN
from sklearn.cluster import DBSCAN, HDBSCAN
from numba import config


@@ -235,8 +234,7 @@ def run_hdbscan(
df: pd.DataFrame,
min_cluster_size: int,
min_samples: int,
cache_dir: str = None,
core_dist_n_jobs: int = -1,
n_jobs: int = -1,
) -> pd.DataFrame:
"""Run clustering on `df` at provided `min_cluster_size`.
@@ -261,14 +259,9 @@ def run_hdbscan(
The number of samples in a neighborhood for a point to be
considered a core point.
cache_dir : str, optional
Used to cache the output of the computation of the tree.
By default, no caching is done. If a string is given, it is the
path to the caching directory.
core_dist_n_jobs: int
n_jobs: int
Number of parallel jobs to run in core distance computations.
For ``core_dist_n_jobs`` below -1, (n_cpus + 1 + core_dist_n_jobs) are used.
For ``n_jobs`` below -1, (n_cpus + 1 + n_jobs) are used.
Returns
-------
@@ -304,8 +297,7 @@ def run_hdbscan(
min_samples=min_samples,
cluster_selection_method="leaf",
allow_single_cluster=True,
memory=cache_dir,
core_dist_n_jobs=core_dist_n_jobs,
n_jobs=n_jobs,
).fit_predict(features_df.to_numpy())
clusters = pd.Series(clusters, index=df.index, name="cluster")
# NOTE: HDBSCAN labels outliers with -1
@@ -325,7 +317,7 @@ def recursive_hdbscan(
verbose: bool = False,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
"""Recursively run HDBSCAN starting with defaults and iterating the min_samples
and min_cluster_size until only 1 cluster is recovered.
and min_cluster_size until only 1 cluster is recovered.
Parameters
----------
@@ -372,14 +364,12 @@ def recursive_hdbscan(
n_clusters = float("inf")
best_median = float("-inf")
best_df = pd.DataFrame()
cache_dir = tempfile.mkdtemp()
while n_clusters > 1:
binned_df = run_hdbscan(
table,
min_cluster_size=min_cluster_size,
min_samples=min_samples,
cache_dir=cache_dir,
core_dist_n_jobs=n_jobs,
n_jobs=n_jobs,
)
df, metrics_df = add_metrics(df=binned_df, markers_df=markers_df)
filtered_df = apply_binning_metrics_filter(
@@ -403,8 +393,6 @@
)

if min_cluster_size >= max_min_cluster_size:
shutil.rmtree(cache_dir)
cache_dir = tempfile.mkdtemp()
min_samples += 1
min_cluster_size = 2
else:
@@ -416,8 +404,6 @@
if min_samples >= max_min_samples:
max_min_cluster_size *= 2

# clean up cache now that we are out of while loop
shutil.rmtree(cache_dir)
# Check our df is not empty from while loop
if best_df.empty:
if verbose:
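This recursive_dbscan.py change swaps the standalone hdbscan package for sklearn.cluster.HDBSCAN (available in scikit-learn >= 1.3), so the memory/cache_dir plumbing and core_dist_n_jobs argument give way to a plain n_jobs. Below is a sketch of the equivalent call, run on a small synthetic feature table rather than real contig embeddings.

import numpy as np
import pandas as pd
from sklearn.cluster import HDBSCAN  # requires scikit-learn >= 1.3

# Toy stand-in for the contig feature table (e.g. embedded k-mer frequencies + coverage)
rng = np.random.default_rng(42)
features_df = pd.DataFrame(
    rng.normal(size=(100, 2)),
    columns=["x_1", "x_2"],
    index=[f"contig_{i}" for i in range(100)],
)

clusterer = HDBSCAN(
    min_cluster_size=2,
    min_samples=1,
    cluster_selection_method="leaf",
    allow_single_cluster=True,
    n_jobs=-1,  # replaces hdbscan's core_dist_n_jobs; sklearn's version has no memory/cache_dir
)
labels = clusterer.fit_predict(features_df.to_numpy())
clusters = pd.Series(labels, index=features_df.index, name="cluster")
# NOTE: HDBSCAN labels outliers with -1
print(clusters.value_counts().head())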
10 changes: 7 additions & 3 deletions autometa/binning/unclustered_recruitment.py
@@ -407,9 +407,13 @@ def get_confidence_filtered_predictions(
# Filter predictions by confidence threshold
confidence_threshold = num_classifications * confidence
df = df[df.max(axis="columns") >= confidence_threshold]
filtered_predictions = df.idxmax(axis="columns")
filtered_predictions.name = "cluster"
return filtered_predictions.to_frame()
if df.empty:
filtered_predictions = pd.DataFrame(
[], columns=["contig", "cluster"]
).set_index("contig")
else:
filtered_predictions = df.idxmax(axis="columns").to_frame(name="cluster")
return filtered_predictions


def filter_contaminating_predictions(
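The unclustered_recruitment.py hunk guards against calling idxmax on an empty frame once the confidence filter removes every row. A small stand-alone sketch of that guarded step follows, using a hypothetical vote table; the helper name and example numbers are illustrative, not part of the module.

import pandas as pd

def confidence_filtered_clusters(df: pd.DataFrame, num_classifications: int, confidence: float) -> pd.DataFrame:
    """Keep rows whose best vote count clears the confidence threshold, then take the winning cluster per contig."""
    confidence_threshold = num_classifications * confidence
    df = df[df.max(axis="columns") >= confidence_threshold]
    if df.empty:
        # Return an empty, correctly-shaped frame instead of calling idxmax on nothing
        return pd.DataFrame([], columns=["contig", "cluster"]).set_index("contig")
    return df.idxmax(axis="columns").to_frame(name="cluster")

# Hypothetical vote table: rows are contigs, columns are candidate clusters
votes = pd.DataFrame(
    {"bin_1": [8, 1], "bin_2": [2, 9]},
    index=pd.Index(["contig_1", "contig_2"], name="contig"),
)
print(confidence_filtered_clusters(votes, num_classifications=10, confidence=1.0))  # empty frame
print(confidence_filtered_clusters(votes, num_classifications=10, confidence=0.8))  # both contigs kept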
26 changes: 15 additions & 11 deletions autometa/taxonomy/database.py
@@ -31,7 +31,7 @@ class TaxonomyDatabase(ABC):
class GTDB(TaxonomyDatabase):
def __init__(self, ...):
self.nodes = self.parse_nodes()
self.names = self.parse_names()
self.names = self.parse_names()
self.merged = self.parse_merged()
self.delnodes = self.parse_delnodes()
...
@@ -59,6 +59,7 @@ def convert_accessions_to_taxids(self, accessions):
Available attributes:
CANONICAL_RANKS
UNCLASSIFIED
"""

CANONICAL_RANKS = [
@@ -71,6 +72,7 @@ def convert_accessions_to_taxids(self, accessions):
"superkingdom",
"root",
]
UNCLASSIFIED = "unclassified"

@abstractmethod
def parse_nodes(self) -> Dict[int, Dict[str, Union[str, int]]]:
@@ -100,7 +102,7 @@ def parse_names(self) -> Dict[int, str]:
Returns
-------
str
Name of provided `taxid` if `taxid` is found in names.dmp else 'unclassified'
Name of provided `taxid` if `taxid` is found in names.dmp else TaxonomyDatabase.UNCLASSIFIED
"""

@@ -237,7 +239,7 @@ def name(self, taxid: int, rank: str = None) -> str:
Returns
-------
str
Name of provided `taxid` if `taxid` is found in names.dmp else 'unclassified'
Name of provided `taxid` if `taxid` is found in names.dmp else TaxonomyDatabase.UNCLASSIFIED
"""
try:
@@ -246,19 +248,19 @@ def name(self, taxid: int, rank: str = None) -> str:
logger.warning(err)
taxid = 0
if not rank:
return self.names.get(taxid, "unclassified")
return self.names.get(taxid, TaxonomyDatabase.UNCLASSIFIED)
if rank not in set(TaxonomyDatabase.CANONICAL_RANKS):
logger.warning(f"{rank} not in canonical ranks!")
return "unclassified"
return TaxonomyDatabase.UNCLASSIFIED
ancestor_taxid = taxid
while ancestor_taxid != 1:
ancestor_rank = self.rank(ancestor_taxid)
if ancestor_rank == rank:
return self.names.get(ancestor_taxid, "unclassified")
return self.names.get(ancestor_taxid, TaxonomyDatabase.UNCLASSIFIED)
ancestor_taxid = self.parent(ancestor_taxid)
# At this point we have not encountered a name for the taxid rank
# so we will place this as unclassified.
return "unclassified"
return TaxonomyDatabase.UNCLASSIFIED

def rank(self, taxid: int) -> str:
"""
@@ -272,15 +274,17 @@ def rank(self, taxid: int) -> str:
Returns
-------
str
rank name if taxid is found in nodes else "unclassified"
rank name if taxid is found in nodes else autoattribute:: autometa.taxonomy.database.TaxonomyDatabase.UNCLASSIFIED
"""
try:
taxid = self.convert_taxid_dtype(taxid)
except DatabaseOutOfSyncError as err:
logger.warning(err)
taxid = 0
return self.nodes.get(taxid, {"rank": "unclassified"}).get("rank")
return self.nodes.get(taxid, {"rank": TaxonomyDatabase.UNCLASSIFIED}).get(
"rank"
)

def parent(self, taxid: int) -> int:
"""
Expand Down Expand Up @@ -368,7 +372,7 @@ def get_lineage_dataframe(
taxids : iterable
`taxids` whose lineage dataframe is being returned
fillna : bool, optional
Whether to fill the empty cells with 'unclassified' or not, default True
Whether to fill the empty cells with TaxonomyDatabase.UNCLASSIFIED or not, default True
Returns
-------
Expand Down Expand Up @@ -408,5 +412,5 @@ def get_lineage_dataframe(
df = pd.DataFrame(ranked_taxids).transpose()
df.index.name = "taxid"
if fillna:
df.fillna(value="unclassified", inplace=True)
df.fillna(value=TaxonomyDatabase.UNCLASSIFIED, inplace=True)
return df
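The database.py hunks replace the repeated 'unclassified' string literal with a single TaxonomyDatabase.UNCLASSIFIED class attribute used by name(), rank(), and get_lineage_dataframe(). A trimmed-down sketch of that pattern follows; the ToyDatabase subclass and its two taxid entries are hypothetical stand-ins for the real names.dmp/nodes.dmp parsers.

from abc import ABC
from typing import Dict

class TaxonomyDatabase(ABC):
    """Trimmed-down sketch of the shared constants and the missing-taxid fallback."""
    CANONICAL_RANKS = ["species", "genus", "family", "order", "class", "phylum", "superkingdom", "root"]
    UNCLASSIFIED = "unclassified"

class ToyDatabase(TaxonomyDatabase):
    # Hypothetical in-memory stand-in for names.dmp / nodes.dmp
    def __init__(self):
        self.names: Dict[int, str] = {1: "root", 562: "Escherichia coli"}
        self.nodes: Dict[int, Dict[str, str]] = {1: {"rank": "root"}, 562: {"rank": "species"}}

    def name(self, taxid: int) -> str:
        return self.names.get(taxid, TaxonomyDatabase.UNCLASSIFIED)

    def rank(self, taxid: int) -> str:
        return self.nodes.get(taxid, {"rank": TaxonomyDatabase.UNCLASSIFIED}).get("rank")

db = ToyDatabase()
print(db.name(562))   # Escherichia coli
print(db.name(999))   # unclassified (missing taxid falls back to the class constant)
print(db.rank(999))   # unclassified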