From 148f49068ea959c26680218eb0986b89aa900074 Mon Sep 17 00:00:00 2001
From: Siddharth Uppal <suppal3@wisc.edu>
Date: Tue, 11 Apr 2023 16:14:30 -0500
Subject: [PATCH 01/11] =?UTF-8?q?=F0=9F=A7=91=E2=80=8D=F0=9F=94=A7=20?=
 =?UTF-8?q?=F0=9F=93=9D=20Fix=20docs=20(#323)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Pin sphinx to version 6
* readthedocs build now requires installing autometa using `pip` in .readthedocs.yaml
* Add mocks for gdown, attrs, numpy, pandas, scipy, numba, skbio, trimap
* Pin docutils between 0.18 and 0.20
* Pin sphinx_rtd_theme to 1.2
---
 .readthedocs.yaml     |  2 ++
 docs/requirements.txt |  7 ++++---
 docs/source/conf.py   | 17 ++++++++++++++++-
 3 files changed, 22 insertions(+), 4 deletions(-)

diff --git a/.readthedocs.yaml b/.readthedocs.yaml
index eb39074fa..be33b59ba 100644
--- a/.readthedocs.yaml
+++ b/.readthedocs.yaml
@@ -20,3 +20,5 @@ formats: all
 python:
   install:
     - requirements: docs/requirements.txt
+    - method: pip
+      path: .  
diff --git a/docs/requirements.txt b/docs/requirements.txt
index 5333dd0f9..9ef24e897 100644
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -1,3 +1,4 @@
-sphinx==4.2.0
-sphinx_rtd_theme==1.0.0
-readthedocs-sphinx-search==0.1.1
\ No newline at end of file
+sphinx==6.0
+sphinx_rtd_theme==1.2
+readthedocs-sphinx-search==0.1.1
+docutils>=0.18,<0.20
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 0920262c2..4a9bfe2de 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -17,7 +17,22 @@
 for dirpath, dirnames, filenames in os.walk("../../", topdown=True):
     sys.path.insert(0, os.path.abspath(dirpath))
 
-autodoc_mock_imports = ["Bio", "hdbscan", "tsne", "sklearn", "umap", "tqdm"]
+autodoc_mock_imports = [
+    "Bio",
+    "hdbscan",
+    "tsne",
+    "sklearn",
+    "umap",
+    "tqdm",
+    "pandas",
+    "numpy",
+    "scipy",
+    "numba",
+    "skbio",
+    "trimap",
+    "attrs",
+    "gdown",
+]
 
 # fmt: off
 import parse_argparse

From 32f44d0fb324fbb6b25e2b040d688a5acbd7b3b0 Mon Sep 17 00:00:00 2001
From: kaw97 <31460812+kaw97@users.noreply.github.com>
Date: Fri, 21 Apr 2023 12:06:23 -0500
Subject: [PATCH 02/11] Reorder `autometa-binning` parameters in step-by-step
 tutorial (#314)

- `autometa-binning` parameter explanation is now in the same order as the commands are input
- deprecated `--domain` has been replaced with `--rank-filter` and `--rank-name-filter`
---
 docs/source/step-by-step-tutorial.rst | 70 ++++++++++++++-------------
 1 file changed, 36 insertions(+), 34 deletions(-)

diff --git a/docs/source/step-by-step-tutorial.rst b/docs/source/step-by-step-tutorial.rst
index 4c70aaaca..28728a970 100644
--- a/docs/source/step-by-step-tutorial.rst
+++ b/docs/source/step-by-step-tutorial.rst
@@ -637,51 +637,53 @@ Use the following command to perform binning:
         --coverages $HOME/tutorial/78mbp_metagenome.coverages.tsv \
         --gc-content $HOME/tutorial/78mbp_metagenome.gc_content.tsv \
         --markers $HOME/tutorial/78mbp_metagenome.markers.tsv \
+        --output-binning $HOME/tutorial/78mbp_metagenome.binning.tsv \
+        --output-main $HOME/tutorial/78mbp_metagenome.main.tsv \
         --clustering-method dbscan \
         --completeness 20 \
         --purity 90 \
         --cov-stddev-limit 25 \
         --gc-stddev-limit 5 \
         --taxonomy $HOME/tutorial/78mbp_metagenome.taxonomy.tsv \
-        --output-binning $HOME/tutorial/78mbp_metagenome.binning.tsv \
-        --output-main $HOME/tutorial/78mbp_metagenome.main.tsv \
         --starting-rank superkingdom \
-        --rank-filter superkingdom
+        --rank-filter superkingdom \
         --rank-name-filter bacteria
 
 Let us dissect the above command:
 
-+-------------------------+-----------------------------------------------------------------------------------------+-------------+
-| Flag                    | Function                                                                                | Requirement |
-+=========================+=========================================================================================+=============+
-| ``--kmers``             | Path to embedded k-mer frequencies table                                                | Required    |
-+-------------------------+-----------------------------------------------------------------------------------------+-------------+
-| ``--coverages``         | Path to metagenome coverages table                                                      | Required    |
-+-------------------------+-----------------------------------------------------------------------------------------+-------------+
-| ``--gc-content``        | Path to metagenome GC contents table                                                    | Required    |
-+-------------------------+-----------------------------------------------------------------------------------------+-------------+
-| ``--markers``           | Path to Autometa annotated markers table                                                | Required    |
-+-------------------------+-----------------------------------------------------------------------------------------+-------------+
-| ``--output-binning``    | Path to write Autometa binning results                                                  | Required    |
-+-------------------------+-----------------------------------------------------------------------------------------+-------------+
-| ``--output-main``       | Path to write Autometa main table                                                       | Required    |
-+-------------------------+-----------------------------------------------------------------------------------------+-------------+
-| ``--clustering-method`` | Clustering algorithm to use for recursive binning. Choices dbscan (default) and hdbscan | Optional    |
-+-------------------------+-----------------------------------------------------------------------------------------+-------------+
-| ``--completeness``      | completeness cutoff to retain cluster (default 20)                                      | Optional    |
-+-------------------------+-----------------------------------------------------------------------------------------+-------------+
-| ``--purity``            | purity cutoff to retain cluster (default 95)                                            | Optional    |
-+-------------------------+-----------------------------------------------------------------------------------------+-------------+
-| ``--cov-stddev-limit``  | coverage standard deviation limit to retain cluster (default 25)                        | Optional    |
-+-------------------------+-----------------------------------------------------------------------------------------+-------------+
-| ``--gc-stddev-limit``   | GC content standard deviation limit to retain cluster (default 5)                       | Optional    |
-+-------------------------+-----------------------------------------------------------------------------------------+-------------+
-| ``--taxonomy``          | Path to Autometa assigned taxonomies table                                              | Required    |
-+-------------------------+-----------------------------------------------------------------------------------------+-------------+
-| ``--starting-rank``     | Canonical rank at which to begin subsetting taxonomy (default: superkingdom)            | Optional    |
-+-------------------------+-----------------------------------------------------------------------------------------+-------------+
-| ``--domain``            | Kingdom to consider. Choices bacteria (default) and archaea                             | Optional    |
-+-------------------------+-----------------------------------------------------------------------------------------+-------------+
++-------------------------+--------------------------------------------------------------------------------------------------------------------+-------------+
+| Flag                    | Function                                                                                                           | Requirement |
++=========================+====================================================================================================================+=============+
+| ``--kmers``             | Path to embedded k-mer frequencies table                                                                           | Required    |
++-------------------------+--------------------------------------------------------------------------------------------------------------------+-------------+
+| ``--coverages``         | Path to metagenome coverages table                                                                                 | Required    |
++-------------------------+--------------------------------------------------------------------------------------------------------------------+-------------+
+| ``--gc-content``        | Path to metagenome GC contents table                                                                               | Required    |
++-------------------------+--------------------------------------------------------------------------------------------------------------------+-------------+
+| ``--markers``           | Path to Autometa annotated markers table                                                                           | Required    |
++-------------------------+--------------------------------------------------------------------------------------------------------------------+-------------+
+| ``--output-binning``    | Path to write Autometa binning results                                                                             | Required    |
++-------------------------+--------------------------------------------------------------------------------------------------------------------+-------------+
+| ``--output-main``       | Path to write Autometa main table                                                                                  | Required    |
++-------------------------+--------------------------------------------------------------------------------------------------------------------+-------------+
+| ``--clustering-method`` | Clustering algorithm to use for recursive binning. Choices dbscan (default) and hdbscan                            | Optional    |
++-------------------------+--------------------------------------------------------------------------------------------------------------------+-------------+
+| ``--completeness``      | completeness cutoff to retain cluster (default 20)                                                                 | Optional    |
++-------------------------+--------------------------------------------------------------------------------------------------------------------+-------------+
+| ``--purity``            | purity cutoff to retain cluster (default 95)                                                                       | Optional    |
++-------------------------+--------------------------------------------------------------------------------------------------------------------+-------------+
+| ``--cov-stddev-limit``  | coverage standard deviation limit to retain cluster (default 25)                                                   | Optional    |
++-------------------------+--------------------------------------------------------------------------------------------------------------------+-------------+
+| ``--gc-stddev-limit``   | GC content standard deviation limit to retain cluster (default 5)                                                  | Optional    |
++-------------------------+--------------------------------------------------------------------------------------------------------------------+-------------+
+| ``--taxonomy``          | Path to Autometa assigned taxonomies table                                                                         | Required    |
++-------------------------+--------------------------------------------------------------------------------------------------------------------+-------------+
+| ``--starting-rank``     | Canonical rank at which to begin subsetting taxonomy (default: superkingdom)                                       | Optional    |
++-------------------------+--------------------------------------------------------------------------------------------------------------------+-------------+
+| ``--rank-filter``       | Canonical rank to subset by the value provided by ``--rank-name-filter`` (default: superkingdom)                   | Optional    |
++-------------------------+--------------------------------------------------------------------------------------------------------------------+-------------+
+| ``--rank-name-filter``  | Only retrieve contigs with this value in the canonical rank column given by ``--rank-filter`` (default: bacteria)  | Optional    |
++-------------------------+--------------------------------------------------------------------------------------------------------------------+-------------+
 
 You can view the complete command-line options using ``autometa-binning -h``
 

From f13ee918691a87e2e89a162b6c423588d2545c36 Mon Sep 17 00:00:00 2001
From: shaneroesemann <59748289+shaneroesemann@users.noreply.github.com>
Date: Thu, 10 Aug 2023 11:03:49 -0500
Subject: [PATCH 03/11] =?UTF-8?q?=F0=9F=8E=A8=20=F0=9F=8D=8F=20Issue=20330?=
 =?UTF-8?q?=20redo=20(#338)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* implements changes by @Sidd in issue #329 in a separate new PR

* add pre commit hook to remove unused imports

* :art::green_heart: removed sed/cut changes that belong to another PR
---
 .pre-commit-config.yaml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index b55482320..e1ba631c4 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -11,3 +11,7 @@ repos:
       - id: end-of-file-fixer
       - id: debug-statements
       - id: check-merge-conflict
+  - repo: https://github.com/hadialqattan/pycln
+    rev: v2.1.5 # Possible releases: https://github.com/hadialqattan/pycln/releases
+    hooks:
+      - id: pycln
\ No newline at end of file

From 546b06145799a0bfcb86779c563014532259b6ea Mon Sep 17 00:00:00 2001
From: Evan Rees <25933122+WiscEvan@users.noreply.github.com>
Date: Thu, 17 Aug 2023 12:40:37 -0400
Subject: [PATCH 04/11] :fire::whale::green_heart: Fix docker-builds by
 removing pinned dependencies (#340)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- 💚🐳🔥⬆️   Remove pins for scipy, scikit-learn and joblib
- 💚 🐳 Add build schedule for Autometa docker images
    > This will help to more quickly identify when builds begin failing
    > Add `nightly` tag for scheduled build
- :whale: change user workdir to `/Autometa`
---
 .github/workflows/docker_autometa.yml | 3 +++
 Dockerfile                            | 9 +++++----
 autometa-env.yml                      | 6 +++---
 3 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/docker_autometa.yml b/.github/workflows/docker_autometa.yml
index 4bf3d727f..b6871839e 100644
--- a/.github/workflows/docker_autometa.yml
+++ b/.github/workflows/docker_autometa.yml
@@ -30,6 +30,8 @@ on:
     branches:
       - main
       - dev
+  schedule:
+    - cron: '0 0 * * *'  # every day at midnight (UTC)
 
 jobs:
   docker_autometa:
@@ -50,6 +52,7 @@ jobs:
             type=raw,value=latest,enable=${{ endsWith(github.ref, github.event.repository.default_branch) }}
             type=raw,value={{branch}}
             type=semver,pattern={{version}}
+            type=schedule,pattern=nightly
       - name: Login to DockerHub
         if: github.event_name != 'pull_request'
         uses: docker/login-action@v1
diff --git a/Dockerfile b/Dockerfile
index 5a7d541a0..edc2f042f 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,4 +1,4 @@
-FROM continuumio/miniconda3
+FROM condaforge/mambaforge:latest 
 LABEL maintainer="jason.kwan@wisc.edu"
 
 # Copyright 2022 Ian J. Miller, Evan R. Rees, Kyle Wolf, Siddharth Uppal,
@@ -25,11 +25,12 @@ RUN apt-get update --allow-releaseinfo-change \
     && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
 
 COPY autometa-env.yml ./
-RUN conda env update -n base --file=autometa-env.yml \
-    && conda clean --all -y
+RUN mamba env update -n base --file=autometa-env.yml \
+    && mamba clean --all -y
 
 
-COPY . .
+COPY . /Autometa
+WORKDIR /Autometa
 RUN make install && make clean
 
 # NOTE: DB_DIR must be an absolute path (not a relative path)
diff --git a/autometa-env.yml b/autometa-env.yml
index d1b2b8d1d..1c93fd218 100644
--- a/autometa-env.yml
+++ b/autometa-env.yml
@@ -12,7 +12,7 @@ dependencies:
   - gdown
   - hdbscan
   - hmmer
-  - joblib==1.1.0 # See https://stackoverflow.com/a/73830525/12671809
+  - joblib
   - numba>=0.47
   - numpy>=1.13
   - pandas>=1.1
@@ -24,8 +24,8 @@ dependencies:
   - rsync
   - samtools>=1.11
   - scikit-bio
-  - scipy==1.8.1 #force scipy 1.8 until scikit-bio updates to 1.9, https://github.com/KwanLab/Autometa/issues/285
-  - scikit-learn==0.24 # prevent error from joblib in multiprocessing distance calculations
+  - scipy
+  - scikit-learn
   - seqkit
   - tqdm
   - trimap

From 583369e0d4e6dfea3d93478b384de22b4d332cfb Mon Sep 17 00:00:00 2001
From: kaw97 <31460812+kaw97@users.noreply.github.com>
Date: Wed, 23 Aug 2023 09:31:27 -0500
Subject: [PATCH 05/11] singularity image urls (#316)

Add singularity urls for autometa 2.2.0
---
 modules/local/align_reads.nf               | 2 +-
 modules/local/binning.nf                   | 2 +-
 modules/local/binning_summary.nf           | 2 +-
 modules/local/hmmer_hmmsearch_filter.nf    | 2 +-
 modules/local/length_table.nf              | 2 +-
 modules/local/majority_vote.nf             | 2 +-
 modules/local/markers.nf                   | 2 +-
 modules/local/prepare_lca.nf               | 2 +-
 modules/local/reduce_lca.nf                | 2 +-
 modules/local/split_kingdoms.nf            | 2 +-
 modules/local/unclustered_recruitment.nf   | 2 +-
 subworkflows/local/prepare_ncbi_taxinfo.nf | 6 +++---
 subworkflows/local/prepare_nr.nf           | 4 ++--
 13 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/modules/local/align_reads.nf b/modules/local/align_reads.nf
index 1a5fd618e..472a8a25f 100644
--- a/modules/local/align_reads.nf
+++ b/modules/local/align_reads.nf
@@ -15,7 +15,7 @@ process ALIGN_READS {
 
     conda (params.enable_conda ? "bioconda::autometa" : null)
     if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) {
-        container "https://depot.galaxyproject.org/singularity/YOUR-TOOL-HERE"
+        container "https://depot.galaxyproject.org/singularity/autometa:2.2.0--pyh7cba7a3_0"
     } else {
         container "jasonkwan/autometa:${params.autometa_image_tag}"
     }
diff --git a/modules/local/binning.nf b/modules/local/binning.nf
index a8b9ebba3..877a65014 100644
--- a/modules/local/binning.nf
+++ b/modules/local/binning.nf
@@ -11,7 +11,7 @@ process BINNING {
 
     conda (params.enable_conda ? "bioconda::autometa" : null)
     if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) {
-        container "https://depot.galaxyproject.org/singularity/YOUR-TOOL-HERE"
+        container "https://depot.galaxyproject.org/singularity/autometa:2.2.0--pyh7cba7a3_0"
     } else {
         container "jasonkwan/autometa:${params.autometa_image_tag}"
     }
diff --git a/modules/local/binning_summary.nf b/modules/local/binning_summary.nf
index ad5554e6d..f0c8010db 100644
--- a/modules/local/binning_summary.nf
+++ b/modules/local/binning_summary.nf
@@ -12,7 +12,7 @@ process BINNING_SUMMARY {
 
     conda (params.enable_conda ? "bioconda::autometa" : null)
     if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) {
-        container "https://depot.galaxyproject.org/singularity/YOUR-TOOL-HERE"
+        container "https://depot.galaxyproject.org/singularity/autometa:2.2.0--pyh7cba7a3_0"
     } else {
         container "jasonkwan/autometa:${params.autometa_image_tag}"
     }
diff --git a/modules/local/hmmer_hmmsearch_filter.nf b/modules/local/hmmer_hmmsearch_filter.nf
index a21be8b3a..26d1d8cee 100644
--- a/modules/local/hmmer_hmmsearch_filter.nf
+++ b/modules/local/hmmer_hmmsearch_filter.nf
@@ -24,7 +24,7 @@ process HMMER_HMMSEARCH_FILTER {
 
     conda (params.enable_conda ? "autometa" : null)
     if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) {
-        container "https://depot.galaxyproject.org/singularity/YOUR-TOOL-HERE"
+        container "https://depot.galaxyproject.org/singularity/autometa:2.2.0--pyh7cba7a3_0"
     } else {
         container "jasonkwan/autometa:${params.autometa_image_tag}"
     }
diff --git a/modules/local/length_table.nf b/modules/local/length_table.nf
index d6b95de69..a94c76833 100644
--- a/modules/local/length_table.nf
+++ b/modules/local/length_table.nf
@@ -12,7 +12,7 @@ process LENGTH_TABLE {
 
     conda (params.enable_conda ? "bioconda::autometa" : null)
     if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) {
-        container "https://depot.galaxyproject.org/singularity/YOUR-TOOL-HERE"
+        container "https://depot.galaxyproject.org/singularity/autometa:2.2.0--pyh7cba7a3_0"
     } else {
         container "jasonkwan/autometa:${params.autometa_image_tag}"
     }
diff --git a/modules/local/majority_vote.nf b/modules/local/majority_vote.nf
index 846773a1b..6271b7bd2 100644
--- a/modules/local/majority_vote.nf
+++ b/modules/local/majority_vote.nf
@@ -12,7 +12,7 @@ process MAJORITY_VOTE {
 
     conda (params.enable_conda ? "bioconda::autometa" : null)
     if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) {
-        container "https://depot.galaxyproject.org/singularity/YOUR-TOOL-HERE"
+        container "https://depot.galaxyproject.org/singularity/autometa:2.2.0--pyh7cba7a3_0"
     } else {
         container "jasonkwan/autometa:${params.autometa_image_tag}"
     }
diff --git a/modules/local/markers.nf b/modules/local/markers.nf
index 5f458ad8b..5835735b7 100644
--- a/modules/local/markers.nf
+++ b/modules/local/markers.nf
@@ -13,7 +13,7 @@ process MARKERS {
 
     conda (params.enable_conda ? "autometa" : null)
     if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) {
-        container "https://depot.galaxyproject.org/singularity/YOUR-TOOL-HERE"
+        container "https://depot.galaxyproject.org/singularity/autometa:2.2.0--pyh7cba7a3_0"
     } else {
         container "jasonkwan/autometa:${params.autometa_image_tag}"
     }
diff --git a/modules/local/prepare_lca.nf b/modules/local/prepare_lca.nf
index e11015153..ce712cd3f 100644
--- a/modules/local/prepare_lca.nf
+++ b/modules/local/prepare_lca.nf
@@ -10,7 +10,7 @@ process PREPARE_LCA {
 
     conda (params.enable_conda ? "bioconda::autometa" : null)
     if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) {
-        container "https://depot.galaxyproject.org/singularity/YOUR-TOOL-HERE"
+        container "https://depot.galaxyproject.org/singularity/autometa:2.2.0--pyh7cba7a3_0"
     } else {
         container "jasonkwan/autometa:${params.autometa_image_tag}"
     }
diff --git a/modules/local/reduce_lca.nf b/modules/local/reduce_lca.nf
index 3424d2255..031564557 100644
--- a/modules/local/reduce_lca.nf
+++ b/modules/local/reduce_lca.nf
@@ -12,7 +12,7 @@ process REDUCE_LCA {
 
     conda (params.enable_conda ? "bioconda::autometa" : null)
     if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) {
-        container "https://depot.galaxyproject.org/singularity/YOUR-TOOL-HERE"
+        container "https://depot.galaxyproject.org/singularity/autometa:2.2.0--pyh7cba7a3_0"
     } else {
         container "jasonkwan/autometa:${params.autometa_image_tag}"
     }
diff --git a/modules/local/split_kingdoms.nf b/modules/local/split_kingdoms.nf
index f93c12fd8..396cd828e 100644
--- a/modules/local/split_kingdoms.nf
+++ b/modules/local/split_kingdoms.nf
@@ -12,7 +12,7 @@ process SPLIT_KINGDOMS {
 
     conda (params.enable_conda ? "bioconda::autometa" : null)
     if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) {
-        container "https://depot.galaxyproject.org/singularity/YOUR-TOOL-HERE"
+        container "https://depot.galaxyproject.org/singularity/autometa:2.2.0--pyh7cba7a3_0"
     } else {
         container "jasonkwan/autometa:${params.autometa_image_tag}"
     }
diff --git a/modules/local/unclustered_recruitment.nf b/modules/local/unclustered_recruitment.nf
index dee238caf..460bf1b9a 100644
--- a/modules/local/unclustered_recruitment.nf
+++ b/modules/local/unclustered_recruitment.nf
@@ -12,7 +12,7 @@ process RECRUIT {
 
     conda (params.enable_conda ? "autometa" : null)
     if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) {
-        container "https://depot.galaxyproject.org/singularity/YOUR-TOOL-HERE"
+        container "https://depot.galaxyproject.org/singularity/autometa:2.2.0--pyh7cba7a3_0"
     } else {
         container "jasonkwan/autometa:${params.autometa_image_tag}"
     }
diff --git a/subworkflows/local/prepare_ncbi_taxinfo.nf b/subworkflows/local/prepare_ncbi_taxinfo.nf
index b11c39c7c..5b6737393 100644
--- a/subworkflows/local/prepare_ncbi_taxinfo.nf
+++ b/subworkflows/local/prepare_ncbi_taxinfo.nf
@@ -15,7 +15,7 @@ process TEST_DOWNLOAD {
 
     conda (params.enable_conda ? "conda-forge::rsync=3.2.3" : null)
     if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) {
-        container "https://depot.galaxyproject.org/singularity/YOUR-TOOL-HERE"
+        container "https://depot.galaxyproject.org/singularity/autometa:2.2.0--pyh7cba7a3_0"
     } else {
         container "jasonkwan/autometa:${params.autometa_image_tag}"
     }
@@ -38,7 +38,7 @@ process DOWNLOAD_ACESSION2TAXID {
 
     conda (params.enable_conda ? "conda-forge::rsync=3.2.3" : null)
     if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) {
-        container "https://depot.galaxyproject.org/singularity/YOUR-TOOL-HERE"
+        container "https://depot.galaxyproject.org/singularity/autometa:2.2.0--pyh7cba7a3_0"
     } else {
         container "jasonkwan/autometa:${params.autometa_image_tag}"
     }
@@ -71,7 +71,7 @@ process DOWNLOAD_TAXDUMP {
 
     conda (params.enable_conda ? "conda-forge::rsync=3.2.3" : null)
     if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) {
-        container "https://depot.galaxyproject.org/singularity/YOUR-TOOL-HERE"
+        container "https://depot.galaxyproject.org/singularity/autometa:2.2.0--pyh7cba7a3_0"
     } else {
         container "jasonkwan/autometa:${params.autometa_image_tag}"
     }
diff --git a/subworkflows/local/prepare_nr.nf b/subworkflows/local/prepare_nr.nf
index da70d61cc..0d1de4b68 100644
--- a/subworkflows/local/prepare_nr.nf
+++ b/subworkflows/local/prepare_nr.nf
@@ -16,7 +16,7 @@ process DOWNLOAD_NR {
 
     conda (params.enable_conda ? "conda-forge::rsync=3.2.3" : null)
     if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) {
-        container "https://depot.galaxyproject.org/singularity/YOUR-TOOL-HERE"
+        container "https://depot.galaxyproject.org/singularity/autometa:2.2.0--pyh7cba7a3_0"
     } else {
         container "jasonkwan/autometa:${params.autometa_image_tag}"
     }
@@ -46,7 +46,7 @@ process TEST_DOWNLOAD {
 
     conda (params.enable_conda ? "conda-forge::rsync=3.2.3" : null)
     if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) {
-        container "https://depot.galaxyproject.org/singularity/YOUR-TOOL-HERE"
+        container "https://depot.galaxyproject.org/singularity/autometa:2.2.0--pyh7cba7a3_0"
     } else {
         container "jasonkwan/autometa:${params.autometa_image_tag}"
     }

From 04b2926c150536ab44bba25ac5008909b70eb953 Mon Sep 17 00:00:00 2001
From: Evan Rees <25933122+WiscEvan@users.noreply.github.com>
Date: Thu, 24 Aug 2023 11:03:06 -0400
Subject: [PATCH 06/11] =?UTF-8?q?=F0=9F=92=9A:bug::snake:=20=E2=AC=86?=
 =?UTF-8?q?=EF=B8=8F=20Fix=20pytest=20and=20resolve=20hdbscan=20dependency?=
 =?UTF-8?q?=20(#341)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* 💚:bug::snake: Fix bug in unclustered recruitment that occurs when 0 predictions pass the confidence filter
* 💚⬆️ Add pin `scikit-learn>=1.3`
* 💚🔥Remove pin `hdbscan` (hdbscan available in `sklearn.cluster` in v1.3)
* :snake::art: Replace `conda` commands with `mamba` in `Makefile`
* :memo: replace instances of conda with mamba
* :fire: Remove virtualenv commands from Makefile (resolves #331)
* :memo: Replace conda with mamba in workflows
---
 Makefile                                    |  24 ++--
 autometa-env.yml                            |   4 +-
 autometa/binning/recursive_dbscan.py        |  28 +---
 autometa/binning/unclustered_recruitment.py |  10 +-
 docs/source/bash-workflow.rst               |  25 ++--
 docs/source/benchmarking.rst                |  32 ++---
 docs/source/how-to-contribute.rst           |  10 +-
 docs/source/installation.rst                |  85 +++++++++---
 docs/source/nextflow-workflow.rst           | 145 ++++++++++----------
 docs/source/step-by-step-tutorial.rst       |   2 +-
 tests/environment.yml                       |   6 +-
 11 files changed, 204 insertions(+), 167 deletions(-)

diff --git a/Makefile b/Makefile
index c082a18d2..71b0112ba 100644
--- a/Makefile
+++ b/Makefile
@@ -10,10 +10,10 @@ PYTHON_INTERPRETER = python3
 # This was retrieved from https://drive.google.com/file/d/1bSlPldaq3C6Cf9Y5Rm7iwtUDcjxAaeEk/view?usp=sharing
 TEST_DATA_FILEID = 1bSlPldaq3C6Cf9Y5Rm7iwtUDcjxAaeEk
 
-ifeq (,$(shell which conda))
-HAS_CONDA=False
+ifeq (,$(shell which mamba))
+HAS_MAMBA=False
 else
-HAS_CONDA=True
+HAS_MAMBA=True
 endif
 
 #################################################################################
@@ -35,20 +35,18 @@ black:
 
 ## Set up python interpreter environment
 create_environment: autometa-env.yml
-ifeq (True,$(HAS_CONDA))
-		@echo ">>> Detected conda, creating conda environment."
+ifeq (True,$(HAS_MAMBA))
+		@echo ">>> Detected mamba, creating mamba environment."
 ifeq (3,$(findstring 3,$(PYTHON_INTERPRETER)))
-	conda env create --file=autometa-env.yml
+	mamba env create --file=autometa-env.yml
 else
 	@echo "It looks like you are not using python 3. Autometa is only compatible with python 3. Please upgrade."
 endif
-	@echo ">>> New conda env created. Activate with:\nsource activate $(PROJECT_NAME)"
+	@echo ">>> New mamba env created. Activate with:\nsource activate $(PROJECT_NAME)"
 else
-	$(PYTHON_INTERPRETER) -m pip install -q virtualenv virtualenvwrapper
-	@echo ">>> Installing virtualenvwrapper if not already installed.\nMake sure the following lines are in shell startup file\n\
-	export WORKON_HOME=$$HOME/.virtualenvs\nexport PROJECT_HOME=$$HOME/Devel\nsource /usr/local/bin/virtualenvwrapper.sh\n"
-	@bash -c "source `which virtualenvwrapper.sh`;mkvirtualenv $(PROJECT_NAME) --python=$(PYTHON_INTERPRETER)"
-	@echo ">>> New virtualenv created. Activate with:\nworkon $(PROJECT_NAME)"
+	@echo "Mamba not detected. Please install before proceeding..."
+	@echo "Mamba docs: https://mamba.readthedocs.io/en/latest/"
+	exit
 endif
 
 #################################################################################
@@ -61,7 +59,7 @@ install: setup.py
 
 ## Install dependencies for test environment
 test_environment: tests/environment.yml
-	conda env update -n $(PROJECT_NAME) --file=$<
+	mamba env update -n $(PROJECT_NAME) --file=$<
 
 ## Build docker image from Dockerfile (auto-taggged as jasonkwan/autometa:<current-branch>)
 image: Dockerfile
diff --git a/autometa-env.yml b/autometa-env.yml
index 1c93fd218..72b1b4629 100644
--- a/autometa-env.yml
+++ b/autometa-env.yml
@@ -10,9 +10,7 @@ dependencies:
   - bowtie2
   - diamond>=2.0
   - gdown
-  - hdbscan
   - hmmer
-  - joblib
   - numba>=0.47
   - numpy>=1.13
   - pandas>=1.1
@@ -25,7 +23,7 @@ dependencies:
   - samtools>=1.11
   - scikit-bio
   - scipy
-  - scikit-learn
+  - scikit-learn>=1.3
   - seqkit
   - tqdm
   - trimap
diff --git a/autometa/binning/recursive_dbscan.py b/autometa/binning/recursive_dbscan.py
index 713e08673..35efa2b4f 100644
--- a/autometa/binning/recursive_dbscan.py
+++ b/autometa/binning/recursive_dbscan.py
@@ -16,8 +16,7 @@
 import pandas as pd
 import numpy as np
 
-from sklearn.cluster import DBSCAN
-from hdbscan import HDBSCAN
+from sklearn.cluster import DBSCAN, HDBSCAN
 from numba import config
 
 
@@ -235,8 +234,7 @@ def run_hdbscan(
     df: pd.DataFrame,
     min_cluster_size: int,
     min_samples: int,
-    cache_dir: str = None,
-    core_dist_n_jobs: int = -1,
+    n_jobs: int = -1,
 ) -> pd.DataFrame:
     """Run clustering on `df` at provided `min_cluster_size`.
 
@@ -261,14 +259,9 @@ def run_hdbscan(
         The number of samples in a neighborhood for a point to be
         considered a core point.
 
-    cache_dir : str, optional
-        Used to cache the output of the computation of the tree.
-        By default, no caching is done. If a string is given, it is the
-        path to the caching directory.
-
-    core_dist_n_jobs: int
+    n_jobs: int
         Number of parallel jobs to run in core distance computations.
-        For ``core_dist_n_jobs`` below -1, (n_cpus + 1 + core_dist_n_jobs) are used.
+        For ``n_jobs`` below -1, (n_cpus + 1 + n_jobs) are used.
 
     Returns
     -------
@@ -304,8 +297,7 @@ def run_hdbscan(
         min_samples=min_samples,
         cluster_selection_method="leaf",
         allow_single_cluster=True,
-        memory=cache_dir,
-        core_dist_n_jobs=core_dist_n_jobs,
+        n_jobs=n_jobs,
     ).fit_predict(features_df.to_numpy())
     clusters = pd.Series(clusters, index=df.index, name="cluster")
     # NOTE: HDBSCAN labels outliers with -1
@@ -325,7 +317,7 @@ def recursive_hdbscan(
     verbose: bool = False,
 ) -> Tuple[pd.DataFrame, pd.DataFrame]:
     """Recursively run HDBSCAN starting with defaults and iterating the min_samples
-     and min_cluster_size until only 1 cluster is recovered.
+    and min_cluster_size until only 1 cluster is recovered.
 
     Parameters
     ----------
@@ -372,14 +364,12 @@ def recursive_hdbscan(
     n_clusters = float("inf")
     best_median = float("-inf")
     best_df = pd.DataFrame()
-    cache_dir = tempfile.mkdtemp()
     while n_clusters > 1:
         binned_df = run_hdbscan(
             table,
             min_cluster_size=min_cluster_size,
             min_samples=min_samples,
-            cache_dir=cache_dir,
-            core_dist_n_jobs=n_jobs,
+            n_jobs=n_jobs,
         )
         df, metrics_df = add_metrics(df=binned_df, markers_df=markers_df)
         filtered_df = apply_binning_metrics_filter(
@@ -403,8 +393,6 @@ def recursive_hdbscan(
             )
 
         if min_cluster_size >= max_min_cluster_size:
-            shutil.rmtree(cache_dir)
-            cache_dir = tempfile.mkdtemp()
             min_samples += 1
             min_cluster_size = 2
         else:
@@ -416,8 +404,6 @@ def recursive_hdbscan(
         if min_samples >= max_min_samples:
             max_min_cluster_size *= 2
 
-    # clean up cache now that we are out of while loop
-    shutil.rmtree(cache_dir)
     # Check our df is not empty from while loop
     if best_df.empty:
         if verbose:
diff --git a/autometa/binning/unclustered_recruitment.py b/autometa/binning/unclustered_recruitment.py
index 7b025a980..fa8bf9284 100644
--- a/autometa/binning/unclustered_recruitment.py
+++ b/autometa/binning/unclustered_recruitment.py
@@ -407,9 +407,13 @@ def get_confidence_filtered_predictions(
     # Filter predictions by confidence threshold
     confidence_threshold = num_classifications * confidence
     df = df[df.max(axis="columns") >= confidence_threshold]
-    filtered_predictions = df.idxmax(axis="columns")
-    filtered_predictions.name = "cluster"
-    return filtered_predictions.to_frame()
+    if df.empty:
+        filtered_predictions = pd.DataFrame(
+            [], columns=["contig", "cluster"]
+        ).set_index("contig")
+    else:
+        filtered_predictions = df.idxmax(axis="columns").to_frame(name="cluster")
+    return filtered_predictions
 
 
 def filter_contaminating_predictions(
diff --git a/docs/source/bash-workflow.rst b/docs/source/bash-workflow.rst
index 80b8a7d1d..a17a6b4d6 100644
--- a/docs/source/bash-workflow.rst
+++ b/docs/source/bash-workflow.rst
@@ -14,17 +14,16 @@ Getting Started
 Compute Environment Setup
 *************************
 
-If you have not previously installed/used Conda, you can get it using the
-Miniconda installer appropriate to your system, here: `<https://docs.conda.io/en/latest/miniconda.html>`_
+If you have not previously installed/used mamba_, you can get it from Mambaforge_.
 
-You may either create a new Conda environment named "autometa"...
+You may either create a new mamba environment named "autometa"...
 
 .. code-block:: bash
 
-    conda create -n autometa -c bioconda autometa
-    # Then, once Conda has finished creating the environment
+    mamba create -n autometa -c conda-forge -c bioconda autometa
+    # Then, once mamba has finished creating the environment
     # you may activate it:
-    conda activate autometa
+    mamba activate autometa
 
 \.\.\. or install Autometa into any of your existing environments.
 
@@ -32,13 +31,13 @@ This installs Autometa in your current active environment:
 
 .. code-block:: bash
 
-    conda install -c bioconda autometa
+    mamba install -c conda-forge -c bioconda autometa
 
 The next command installs Autometa in the provided environment:
 
 .. code-block:: bash
 
-    conda install -n <your-env-name> -c bioconda autometa
+    mamba install -n <your-env-name> -c conda-forge -c bioconda autometa
 
 Download Workflow Template
 **************************
@@ -128,7 +127,7 @@ Alignments Preparation
 .. note::
     The following example requires ``bwa``, ``kart`` and ``samtools``
 
-    ``conda install -c bioconda bwa kart samtools``
+    ``mamba install -c bioconda bwa kart samtools``
 
 .. code-block:: bash
 
@@ -158,7 +157,7 @@ ORFs
 ****
 
 .. note::
-    The following example requires ``prodigal``. e.g. ``conda install -c bioconda prodigal``
+    The following example requires ``prodigal``. e.g. ``mamba install -c bioconda prodigal``
 
 .. code-block:: bash
 
@@ -175,7 +174,7 @@ Diamond blastp Preparation
 **************************
 
 .. note::
-    The following example requires ``diamond``. e.g. ``conda install -c bioconda diamond``
+    The following example requires ``diamond``. e.g. ``mamba install -c bioconda diamond``
 
 .. code-block:: bash
 
@@ -267,7 +266,7 @@ For example, with slurm:
 
 .. caution::
 
-    Make sure your conda autometa environment is activated or the autometa entrypoints will not be available.
+    Make sure your mamba autometa environment is activated or the autometa entrypoints will not be available.
 
 Additional parameters
 #####################
@@ -323,3 +322,5 @@ See :ref:`advanced-usage-binning` section for details
 .. _Trimmomatic: http://www.usadellab.org/cms/?page=trimmomatic
 .. _FastQC: https://www.bioinformatics.babraham.ac.uk/projects/fastqc/
 .. _metaQuast: http://quast.sourceforge.net/metaquast
+.. _Mambaforge: https://github.com/conda-forge/miniforge#mambaforge
+.. _mamba: https://mamba.readthedocs.io/en/latest/
diff --git a/docs/source/benchmarking.rst b/docs/source/benchmarking.rst
index f58eab3e6..6c7aca511 100644
--- a/docs/source/benchmarking.rst
+++ b/docs/source/benchmarking.rst
@@ -7,11 +7,11 @@ Benchmarking
 
 .. note::
 
-    The most recent Autometa benchmarking results covering multiple modules and input parameters are hosted on our 
-    `KwanLab/metaBenchmarks <https://github.com/KwanLab/metaBenchmarks>`_ Github repository and provide a range of 
+    The most recent Autometa benchmarking results covering multiple modules and input parameters are hosted on our
+    `KwanLab/metaBenchmarks <https://github.com/KwanLab/metaBenchmarks>`_ Github repository and provide a range of
     analyses covering multiple stages and parameter sets. These benchmarks are available with their own respective
-    modules so that the community may easily assess how Autometa's novel (``taxon-profiling``, ``clustering``, 
-    ``binning``, ``refinement``) algorithms perform compared to current state-of-the-art methods. Tools were selected for 
+    modules so that the community may easily assess how Autometa's novel (``taxon-profiling``, ``clustering``,
+    ``binning``, ``refinement``) algorithms perform compared to current state-of-the-art methods. Tools were selected for
     benchmarking based on their relevance to environmental, single-assembly, reference-free binning pipelines.
 
 Benchmarking with the ``autometa-benchmark`` module
@@ -51,7 +51,7 @@ Example benchmarking with simulated communities
 
     # Set community size (see above for selection/download of other community types)
     community_size=78Mbp
-    
+
     # Inputs
     ## NOTE: predictions and reference were downloaded using autometa-download-dataset
     predictions="$HOME/Autometa/autometa/datasets/simulated/${community_size}/taxonomy.tsv.gz" # required columns -> contig, taxid
@@ -73,7 +73,7 @@ Example benchmarking with simulated communities
         --output-classification-reports $reports
 
 .. note::
-    Using ``--benchmark=classification`` requires the path to a directory containing files (nodes.dmp, names.dmp, merged.dmp) 
+    Using ``--benchmark=classification`` requires the path to a directory containing files (nodes.dmp, names.dmp, merged.dmp)
     from NCBI's taxdump tarball. This should be supplied using the ``--ncbi`` parameter.
 
 Clustering
@@ -95,7 +95,7 @@ Example benchmarking with simulated communities
     # Outputs
     output_wide="${community_size}.clustering_benchmarks.wide.tsv.gz"
     output_long="${community_size}.clustering_benchmarks.long.tsv.gz"
-    
+
     autometa-benchmark \
         --benchmark clustering \
         --predictions $predictions \
@@ -114,16 +114,16 @@ Example benchmarking with simulated communities
 
     # Set community size (see above for selection/download of other community types)
     community_size=78Mbp
-    
+
     # Inputs
     ## NOTE: predictions and reference were downloaded using autometa-download-dataset
     predictions="$HOME/Autometa/autometa/datasets/simulated/${community_size}/binning.tsv.gz" # required columns -> contig, cluster
     reference="$HOME/Autometa/autometa/datasets/simulated/${community_size}/reference_assignments.tsv.gz"
-    
+
     # Outputs
     output_wide="${community_size}.binning_benchmarks.wide.tsv.gz"
     output_long="${community_size}.binning_benchmarks.long.tsv.gz"
-    
+
     autometa-benchmark \
         --benchmark binning-classification \
         --predictions $predictions \
@@ -172,7 +172,7 @@ Autometa is packaged with a built-in module that allows any user to download any
 To use retrieve these datasets one simply needs to run the ``autometa-download-dataset`` command.
 
 For example, to download the reference assignments for a simulated community as well as the most recent Autometa
-binning and taxon-profiling predictions for this community, provide the following parameters: 
+binning and taxon-profiling predictions for this community, provide the following parameters:
 
 .. code:: bash
 
@@ -195,15 +195,15 @@ Using ``gdrive``
 
 You can download the individual assemblies of different datasests with the help of ``gdown`` using command line
 (This is what ``autometa-download-dataset`` is using behind the scenes). If you have installed ``autometa`` using
-``conda`` then ``gdown`` should already be installed. If not, you can install it using 
-``conda install -c conda-forge gdown`` or ``pip install gdown``.
+``mamba`` then ``gdown`` should already be installed. If not, you can install it using
+``mamba install -c conda-forge gdown`` or ``pip install gdown``.
 
 Example for the 78Mbp simulated community
 """""""""""""""""""""""""""""""""""""""""
 
 1. Navigate to the 78Mbp community dataset using the `link <https://drive.google.com/drive/u/2/folders/1McxKviIzkPyr8ovj8BG7n_IYk-QfHAgG>`_ mentioned above.
-2. Get the file ID by navigating to any of the files and right clicking, then selecting the ``get link`` option. 
-    This will have a ``copy link`` button that you should use. The link for the metagenome assembly 
+2. Get the file ID by navigating to any of the files and right clicking, then selecting the ``get link`` option.
+    This will have a ``copy link`` button that you should use. The link for the metagenome assembly
     (ie. ``metagenome.fna.gz``) should look like this : ``https://drive.google.com/file/d/15CB8rmQaHTGy7gWtZedfBJkrwr51bb2y/view?usp=sharing``
 3. The file ID is within the ``/`` forward slashes between ``file/d/`` and ``/``, e.g:
 
@@ -313,4 +313,4 @@ e.g. ``-l 1250`` would translate to 1250Mbp as the sum of total lengths for all
     # -s  : the standard deviation of DNA/RNA fragment size for paired-end simulations.
     # -l  : the length of reads to be simulated
     $ coverage = ((250 * reads) / (length * 1000000))
-    $ art_illumina -p -ss HS25 -l 125 -f $coverage -o simulated_reads -m 275 -s 90 -i asm_path
\ No newline at end of file
+    $ art_illumina -p -ss HS25 -l 125 -f $coverage -o simulated_reads -m 275 -s 90 -i asm_path
diff --git a/docs/source/how-to-contribute.rst b/docs/source/how-to-contribute.rst
index a210ccc42..9c5ddcc7a 100644
--- a/docs/source/how-to-contribute.rst
+++ b/docs/source/how-to-contribute.rst
@@ -16,10 +16,10 @@ Autometa builds documentation using `readthedocs <https://readthedocs.org/>`__.
 
 .. code-block:: bash
 
-    # Activate your autometa conda environment
-    conda activate autometa
+    # Activate your autometa mamba environment
+    mamba activate autometa
     # Install dependencies
-    conda install -n autometa -c conda-forge \
+    mamba install -n autometa -c conda-forge \
         sphinx sphinx_rtd_theme
     # List all make options
     make
@@ -38,8 +38,8 @@ You will have to install certain dependencies as well as test data to be able to
 
 .. code-block:: bash
 
-    # Activate your autometa conda environment
-    conda activate autometa
+    # Activate your autometa mamba environment
+    mamba activate autometa
     # List all make options
     make
     # Install dependencies for test environment
diff --git a/docs/source/installation.rst b/docs/source/installation.rst
index 498b7087b..c32e3e3fa 100644
--- a/docs/source/installation.rst
+++ b/docs/source/installation.rst
@@ -4,8 +4,8 @@
 Installation
 ============
 
-Currently Autometa package installation is supported by conda_ and docker_.
-For installation using conda, we suggest downloading miniconda_.
+Currently Autometa package installation is supported by mamba_ and docker_.
+For installation using mamba, download mamba from Mambaforge_.
 
 .. attention::
 
@@ -14,23 +14,74 @@ For installation using conda, we suggest downloading miniconda_.
 Direct installation (Quickest)
 ==============================
 
-#. Install miniconda_
+#. Install mamba_
+
+    .. code-block:: bash
+
+        wget "https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforge-$(uname)-$(uname -m).sh"
+        bash Mambaforge-$(uname)-$(uname -m).sh
+
+    Follow the installation prompts and when you get to this:
+
+    .. code-block:: bash
+
+        Do you wish the installer to initialize Mambaforge
+        by running conda init? [yes|no]
+        [no] >>> yes
+
+    This will require restarting the terminal, or resetting
+    the terminal with the source command
+
+    .. code-block:: bash
+
+        # To resolve the comment:
+        # ==> For changes to take effect, close and re-open your current shell. <==
+        # type:
+        source ~/.bashrc
+
+    .. note::
+
+        If you already have conda installed, you can install mamba as a drop-in replacement.
+
+        .. code-block:: bash
+
+            conda install -n base -c conda-forge mamba -y
+
+
 #. Create a new environment with ``autometa`` installed:
 
     .. code-block:: bash
 
-        conda create -c bioconda -n autometa autometa
+        mamba create -c conda-forge -c bioconda -n autometa autometa
+
+    .. note::
+
+            You may add the ``bioconda`` and ``conda-forge`` channels to your mamba
+            config to simplify the command.
+
+            .. code-block:: bash
+
+                mamba config --append channels bioconda
+                mamba config --append channels conda-forge
+
+            Now mamba will search the ``bioconda`` and ``conda-forge``
+            channels alongside the defaults channel.
+
+            .. code-block:: bash
+
+                mamba create -n autometa autometa
+
 
 #. Activate ``autometa`` environment:
 
     .. code-block::
 
-        conda activate autometa
+        mamba activate autometa
 
 Install from source (using make)
 ================================
 
-Download and install miniconda_. Now run the following commands:
+Download and install mamba_. Now run the following commands:
 
 .. code-block:: bash
 
@@ -43,11 +94,11 @@ Download and install miniconda_. Now run the following commands:
     # Navigate into the cloned repository
     cd Autometa
 
-    # create autometa conda environment
+    # create autometa mamba environment
     make create_environment
 
-    # activate autometa conda environment
-    conda activate autometa
+    # activate autometa mamba environment
+    mamba activate autometa
 
     # install autometa source code in autometa environment
     make install
@@ -59,7 +110,7 @@ Download and install miniconda_. Now run the following commands:
 Install from source (full commands)
 ===================================
 
-Download and install miniconda_. Now run the following commands:
+Download and install mamba_. Now run the following commands:
 
 .. code-block:: bash
 
@@ -73,10 +124,10 @@ Download and install miniconda_. Now run the following commands:
     cd Autometa
 
     # Construct the autometa environment from autometa-env.yml
-    conda env create --file=autometa-env.yml
+    mamba env create --file=autometa-env.yml
 
     # Activate environment
-    conda activate autometa
+    mamba activate autometa
 
     # Install the autometa code base from source
     python -m pip install . --ignore-installed --no-deps -vv
@@ -115,8 +166,8 @@ To run the tests, however, you'll first need to install the following packages a
 
 .. code-block:: bash
 
-    # Activate your autometa conda environment
-    conda activate autometa
+    # Activate your autometa mamba environment
+    mamba activate autometa
 
     # List all make options
     make
@@ -141,12 +192,12 @@ You can now run different unit tests using the following commands:
     make unit_test_wip
 
 .. note::
+
     As a shortcut you can also create the test environment and run **all** the unit tests using ``make unit_test`` command.
 
 For more information about the above commands see the :ref:`Contributing Guidelines` page.
 Additional unit tests are provided in the test directory. These are designed to aid in future development of autometa.
 
-.. _conda: https://docs.conda.io/en/latest/
-.. _miniconda: https://docs.conda.io/en/latest/miniconda.html
+.. _mamba: https://mamba.readthedocs.io/en/latest/index.html
+.. _Mambaforge: https://github.com/conda-forge/miniforge#mambaforge
 .. _Docker: https://www.docker.com/
-.. _anaconda: https://www.anaconda.com/
diff --git a/docs/source/nextflow-workflow.rst b/docs/source/nextflow-workflow.rst
index ba097318e..ad5790e2f 100644
--- a/docs/source/nextflow-workflow.rst
+++ b/docs/source/nextflow-workflow.rst
@@ -16,12 +16,12 @@ System Requirements
 Currently the nextflow pipeline requires Docker 🐳 so it must be installed on your system.
 If you don't have Docker installed you can install it from `docs.docker.com/get-docker <https://docs.docker.com/get-docker>`_.
 We plan on removing this dependency in future versions, so that other dependency managers
-(e.g. Conda, Singularity, etc) can be used.
+(e.g. Conda, Mamba, Singularity, etc) can be used.
 
 Nextflow runs on any Posix compatible system. Detailed system requirements
 can be found in the `nextflow documentation <https://www.nextflow.io/docs/latest/getstarted.html#requirements>`_
 
-Nextflow (required) and nf-core tools (optional but highly recommended) installation will be discussed in :ref:`install-nextflow-nfcore-with-conda`.
+Nextflow (required) and nf-core tools (optional but highly recommended) installation will be discussed in :ref:`install-nextflow-nfcore-with-mamba`.
 
 Data Preparation
 ################
@@ -138,7 +138,7 @@ Example ``sample_sheet.csv``
 Quick Start
 ###########
 
-The following is a condensed summary of steps required to get Autometa installed, configured and running. 
+The following is a condensed summary of steps required to get Autometa installed, configured and running.
 There are links throughout to the appropriate documentation sections that can provide more detail if required.
 
 Installation
@@ -146,14 +146,14 @@ Installation
 
 For full installation instructions, please see the :ref:`installation-page` section
 
-If you would like to install Autometa via conda (I'd recommend it, its almost foolproof!), 
-you'll need to first install Miniconda on your system. You can do this in a few easy steps:
+If you would like to install Autometa via mamba (I'd recommend it, it's almost foolproof!),
+you'll need to first download the Mambaforge_ installer on your system. You can do this in a few easy steps:
 
-1. Type in the following and then hit enter. This will download the Miniconda installer to your home directory.
+1. Type in the following and then hit enter. This will download the Mambaforge installer to your home directory.
 
 .. code-block:: bash
 
-    wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O $HOME/Miniconda3-latest-Linux-x86_64.sh
+    wget "https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforge-$(uname)-$(uname -m).sh" -O "$HOME/Mambaforge-$(uname)-$(uname -m).sh"
 
 .. note::
 
@@ -163,44 +163,44 @@ you'll need to first install Miniconda on your system. You can do this in a few
 
 .. code-block:: bash
 
-    bash $HOME/Miniconda3-latest-Linux-x86_64.sh
+    bash $HOME/Mambaforge-$(uname)-$(uname -m).sh
+    # On my machine this was /home/sam/Mambaforge-Linux-x86_64.sh
 
 3.	Follow all of the prompts. Keep pressing enter until it asks you to accept. Then type yes and enter. Say yes to everything.
 
-.. note:: 
+.. note::
 
-    If for whatever reason, you accidentally said no to the initialization, do not fear. 
+    If for whatever reason, you accidentally said no to the initialization, do not fear.
     We can fix this by running the initialization with the following command:
 
     .. code-block:: bash
 
-        cd $HOME/miniconda3/bin/
-        ./conda init
-    
+        $HOME/mambaforge/bin/mamba init
+
 
-4. 	Finally, for the changes to take effect, you'll need to run the following line of code which effectively acts as a "refresh"
+4. 	Finally, for the changes to take effect, you'll need to run the following line of code which effectively acts as a "refresh"
 
 .. code-block:: bash
-    
-    source ~/.bashrc
 
-Now that you have conda up and running, its time to install the Autometa conda environment. Run the following code:
+    source $HOME/.bashrc
+
+Now that you have mamba up and running, it's time to install the Autometa mamba environment. Run the following code:
 
 .. code-block:: bash
 
-    conda env create --file=https://raw.githubusercontent.com/KwanLab/Autometa/main/nextflow-env.yml
-    
+    mamba env create --file=https://raw.githubusercontent.com/KwanLab/Autometa/main/nextflow-env.yml
+
 .. attention::
 
-    You will only need to run the installation (code above) once. The installation does NOT need to be performed every time you wish to use Autometa. 
-    Once installation is complete, the conda environment (which holds all the tools that Autometa needs) will live on your server/computer 
+    You will only need to run the installation (code above) once. The installation does NOT need to be performed every time you wish to use Autometa.
+    Once installation is complete, the mamba environment (which holds all the tools that Autometa needs) will live on your server/computer
     much like any other program you install.
 
-Anytime you would like to run Autometa, you'll need to activate the conda environment. To activate the environment you'll need to run the following command:
+Anytime you would like to run Autometa, you'll need to activate the mamba environment. To activate the environment you'll need to run the following command:
 
 .. code-block:: bash
 
-    conda activate autometa-nf
+    mamba activate autometa-nf
 
 Configuring a scheduler
 ***********************
@@ -239,13 +239,13 @@ Then copy the following code block into that new file ("agrp" is the slurm parti
             }
         }
 
-Keep this file somewhere central to you. For the sake of this example I will be keeping it in a folder called "Useful scripts" in my home directory 
+Keep this file somewhere central to you. For the sake of this example I will be keeping it in a folder called "Useful scripts" in my home directory
 because that is a central point for me where I know I can easily find the file and it won't be moved e.g.
 :code:`/home/sam/Useful_scripts/slurm_nextflow.config`
 
-Save your new file with Ctrl+O and then exit nano with Ctrl+O. 
+Save your new file with Ctrl+O and then exit nano with Ctrl+X.
 
-Installation and set up is now complete. 🎉 🥳 
+Installation and set up is now complete. 🎉 🥳
 
 Running Autometa
 ****************
@@ -253,19 +253,19 @@ Running Autometa
 For a comprehensive list of features and options and how to use them please see :ref:`Running the pipeline`
 
 Autometa can bin one or several metagenomic datasets in one run. Regardless of the number of metagenomes you
-want to process, you will need to provide a sample sheet which specifies the name of your sample, the full path to 
+want to process, you will need to provide a sample sheet which specifies the name of your sample, the full path to
 where that data is found and how to retrieve the sample's contig coverage information.
 
-If the metagenome was assembled via SPAdes, Autometa can extract coverage and contig length information from the sequence headers. 
+If the metagenome was assembled via SPAdes, Autometa can extract coverage and contig length information from the sequence headers.
 
-If you used a different assembler you will need to provide either raw reads or a table containing contig/scaffold coverage information. 
-Full details for data preparation may be found under :ref:`sample-sheet-preparation` 
+If you used a different assembler you will need to provide either raw reads or a table containing contig/scaffold coverage information.
+Full details for data preparation may be found under :ref:`sample-sheet-preparation`
 
-First ensure that your Autometa conda environment is activated. You can activate your environment by running:
+First ensure that your Autometa mamba environment is activated. You can activate your environment by running:
 
 .. code-block:: bash
-    
-    conda activate autometa-nf
+
+    mamba activate autometa-nf
 
 Run the following code to launch Autometa:
 
@@ -275,7 +275,7 @@ Run the following code to launch Autometa:
 
 .. note::
 
-    You may want to note where you have saved your input sample sheet prior to running the launch command. 
+    You may want to note where you have saved your input sample sheet prior to running the launch command.
     It is much easier (and less error prone) to copy/paste the sample sheet file path when specifying the input (We'll get to this later in :ref:`quickstart-menu-4`).
 
 You will now use the arrow keys to move up and down between your options and hit your "Enter" or "Return" key to make your choice.
@@ -296,8 +296,8 @@ You will now use the arrow keys to move up and down between your options and hit
 Choose a version
 ----------------
 
-The double, right-handed arrows should already indicate the latest release of Autometa (in our case ``2.0.0``). 
-The latest version of the tool will always be at the top of the list with older versions descending below. 
+The double, right-handed arrows should already indicate the latest release of Autometa (in our case ``2.0.0``).
+The latest version of the tool will always be at the top of the list with older versions descending below.
 To select the latest version, ensure that the double, right-handed arrows are next to ``2.0.0``, then hit "Enter".
 
 .. image:: ../img/Menu1.png
@@ -311,7 +311,7 @@ Pick the ``Command line`` option.
 
 .. note::
 
-    Unless you've done some fancy server networking (i.e. tunneling and port-forwarding), 
+    Unless you've done some fancy server networking (i.e. tunneling and port-forwarding),
     or are using Autometa locally, ``Command line`` is your *only* option.
 
 .. image:: ../img/Menu2.png
@@ -321,7 +321,7 @@ Pick the ``Command line`` option.
 General nextflow parameters
 ---------------------------
 
-If you are using a scheduler (Slurm in this example), ``-profile`` is the only option you'll need to change. 
+If you are using a scheduler (Slurm in this example), ``-profile`` is the only option you'll need to change.
 If you are not using a scheduler, you may skip this step.
 
 .. image:: ../img/Menu3.png
@@ -331,12 +331,12 @@ If you are not using a scheduler, you may skip this step.
 Input and Output
 ----------------
 
-Now we need to give Autometa the full paths to our input sample sheet, output results folder 
-and output logs folder (aka where trace files are stored). 
+Now we need to give Autometa the full paths to our input sample sheet, output results folder
+and output logs folder (aka where trace files are stored).
 
 .. note::
 
-    A new folder, named by its respective sample value, will be created within the output results folder for 
+    A new folder, named by its respective sample value, will be created within the output results folder for
     each metagenome listed in the sample sheet.
 
 .. image:: ../img/Menu4.png
@@ -346,14 +346,14 @@ and output logs folder (aka where trace files are stored).
 Binning parameters
 ------------------
 
-If you're not sure what you're doing I would recommend only changing ``length_cutoff``. 
-The default cutoff is 3000bp, which means that any contigs/scaffolds smaller than 3000bp will not be considered for binning. 
+If you're not sure what you're doing I would recommend only changing ``length_cutoff``.
+The default cutoff is 3000bp, which means that any contigs/scaffolds smaller than 3000bp will not be considered for binning.
 
 .. note::
 
-    This cutoff will depend on how good your assembly is: e.g. if your N50 is 1200bp, I would choose a cutoff of 1000. 
+    This cutoff will depend on how good your assembly is: e.g. if your N50 is 1200bp, I would choose a cutoff of 1000.
     If your N50 is more along the lines of 5000, I would leave the cutoff at the default 3000. I would strongly recommend
-    against choosing a number below 900 here. In the example below, I have chosen a cutoff of 1000bp as my assembly was 
+    against choosing a number below 900 here. In the example below, I have chosen a cutoff of 1000bp as my assembly was
     not particularly great (the N50 is 1100bp).
 
 .. image:: ../img/Menu5.png
@@ -363,17 +363,17 @@ The default cutoff is 3000bp, which means that any contigs/scaffolds smaller tha
 Additional Autometa options
 ---------------------------
 
-Here you have a choice to make: 
+Here you have a choice to make:
 
-* By enabling taxonomy aware mode, Autometa will attempt to use taxonomic data to make your bins more accurate. 
+* By enabling taxonomy aware mode, Autometa will attempt to use taxonomic data to make your bins more accurate.
 
-However, this is a more computationally expensive step and will make the process take longer. 
+However, this is a more computationally expensive step and will make the process take longer.
 
 * By leaving this option as the default ``False`` option, Autometa will bin according to coverage and kmer patterns.
 
 Despite your choice, you will need to provide a path to the necessary databases using the ``single_db_dir`` option.
-In the example below, I have enabled the taxonomy aware mode and provided the path to where the databases are stored 
-(in my case this is :code:`/home/sam/Databases`). 
+In the example below, I have enabled the taxonomy aware mode and provided the path to where the databases are stored
+(in my case this is :code:`/home/sam/Databases`).
 
 For additional details on required databases, see the :ref:`Databases` section.
 
@@ -384,13 +384,13 @@ For additional details on required databases, see the :ref:`Databases` section.
 Computational parameters
 ------------------------
 
-This will depend on the computational resources you have available. You could start with the default values and see 
-how the binning goes. If you have particularly complex datasets you may want to bump this up a bit. For your 
-average metagenome, you won't need more than 150Gb of memory. I've opted to use 75 Gb as a 
-starting point for a few biocrust (somewhat diverse) metagenomes. 
+This will depend on the computational resources you have available. You could start with the default values and see
+how the binning goes. If you have particularly complex datasets you may want to bump this up a bit. For your
+average metagenome, you won't need more than 150Gb of memory. I've opted to use 75 Gb as a
+starting point for a few biocrust (somewhat diverse) metagenomes.
 
 .. note::
-    
+
     These options correspond to the resources provided to *each* process of Autometa, *not* the entire workflow itself.
 
     Also, for TB worth of assembled data you may want to try the :ref:`autometa-bash-workflow` using the
@@ -409,7 +409,7 @@ to prevent immediately performing the nextflow run command.
 
 .. image:: ../img/launch_choice.png
 
-If you recall, we created a file called :code:`slurm_nextflow.config` that contains the information Autometa will need to communicate with the Slurm scheduler. 
+If you recall, we created a file called :code:`slurm_nextflow.config` that contains the information Autometa will need to communicate with the Slurm scheduler.
 We need to include that file using the :code:`-c` flag (or configuration flag). Therefore to launch the Autometa workflow, run the following command:
 
 .. note::
@@ -433,41 +433,40 @@ Basic
 While the Autometa Nextflow pipeline can be run using Nextflow directly, we designed
 it using nf-core standards and templating to provide an easier user experience through
 use of the nf-core "tools" python library. The directions below demonstrate using a minimal
-Conda environment to install Nextflow and nf-core tools and then running the Autometa pipeline.
+mamba environment to install Nextflow and nf-core tools and then running the Autometa pipeline.
 
-.. _install-nextflow-nfcore-with-conda:
+.. _install-nextflow-nfcore-with-mamba:
 
-Installing Nextflow and nf-core tools with Conda
+Installing Nextflow and nf-core tools with mamba
 ************************************************
 
-If you have not previously installed/used Conda, you can get it using the
-Miniconda installer appropriate to your system, here: `<https://docs.conda.io/en/latest/miniconda.html>`_
+If you have not previously installed/used mamba_, you can get it from Mambaforge_.
 
-After installing conda, running the following command will create a minimal
-Conda environment named "autometa-nf", and install Nextflow and nf-core tools.
+After installing mamba, running the following command will create a minimal
+mamba environment named "autometa-nf", and install Nextflow and nf-core tools.
 
 .. code-block:: bash
 
-    conda env create --file=https://raw.githubusercontent.com/KwanLab/Autometa/main/nextflow-env.yml
+    mamba env create --file=https://raw.githubusercontent.com/KwanLab/Autometa/main/nextflow-env.yml
 
 If you receive the message...
 
 .. code-block:: bash
 
-    CondaValueError: prefix already exists:
+    CondaValueError: prefix already exists: /home/user/mambaforge/envs/autometa-nf
 
 ...it means you have already created the environment. If you want to overwrite/update
 the environment then add the :code:`--force` flag to the end of the command.
 
 .. code-block:: bash
 
-    conda env create --file=https://raw.githubusercontent.com/KwanLab/Autometa/main/nextflow-env.yml --force
+    mamba env create --file=https://raw.githubusercontent.com/KwanLab/Autometa/main/nextflow-env.yml --force
 
-Once Conda has finished creating the environment be sure to activate it:
+Once mamba has finished creating the environment be sure to activate it:
 
 .. code-block:: bash
 
-    conda activate autometa-nf
+    mamba activate autometa-nf
 
 
 Using nf-core
@@ -484,7 +483,7 @@ start the pipeline launch process.
     nf-core launch KwanLab/Autometa
 
 .. caution::
-    
+
     nf-core will give a list of revisions to use following the above command.
     Any of the version 1.* revisions are NOT supported.
 
@@ -543,7 +542,7 @@ The other parameter is a nextflow argument, specified with :code:`-profile`. Thi
     are able to successfully configure these profiles, please get in touch or submit a pull request and we will add these configurations
     to the repository.
 
-    - :code:`conda`: Enables running all processes using `conda <https://www.nextflow.io/docs/latest/conda.html>`_
+    - :code:`mamba`: Enables running all processes using `mamba <https://mamba.readthedocs.io/en/latest/>`_
     - :code:`singularity`: Enables running all processes using `singularity <https://www.nextflow.io/docs/latest/singularity.html>`_
     - :code:`podman`: Enables running all processes using `podman <https://www.nextflow.io/docs/latest/podman.html>`_
     - :code:`shifter`: Enables running all processes using `shifter <https://www.nextflow.io/docs/latest/shifter.html>`_
@@ -581,7 +580,7 @@ using the :code:`nextflow run ...` command by prepending the parameter name with
 
     You can run the ``KwanLab/Autometa`` project without using nf-core if you already have a correctly
     formatted parameters file. (like the one generated from ``nf-core launch ...``, i.e. ``nf-params.json``)
-    
+
     .. code-block:: bash
 
         nextflow run KwanLab/Autometa -params-file nf-params.json -profile slurm -resume
@@ -795,7 +794,7 @@ Visualizing the Workflow
 ------------------------
 
 You can visualize the entire workflow ie. create the directed acyclic graph (DAG) of processes from the written DOT file. First install
-`Graphviz <https://graphviz.org/>`_ (``conda install -c anaconda graphviz``) then do ``dot -Tpng < pipeline_info/autometa-dot > autometa-dag.png`` to get the
+`Graphviz <https://graphviz.org/>`_ (``mamba install -c anaconda graphviz``) then do ``dot -Tpng < pipeline_info/autometa-dot > autometa-dag.png`` to get the DAG
 in the ``png`` format.
 
 Configuring your process executor
@@ -868,3 +867,5 @@ To use this tagged version (or any other Autometa image tag) add the argument ``
 .. _Trimmomatic: http://www.usadellab.org/cms/?page=trimmomatic
 .. _FastQC: https://www.bioinformatics.babraham.ac.uk/projects/fastqc/
 .. _metaQuast: http://quast.sourceforge.net/metaquast
+.. _Mambaforge: https://github.com/conda-forge/miniforge#mambaforge
+.. _mamba: https://mamba.readthedocs.io/en/latest/
diff --git a/docs/source/step-by-step-tutorial.rst b/docs/source/step-by-step-tutorial.rst
index 28728a970..1a2dc76e9 100644
--- a/docs/source/step-by-step-tutorial.rst
+++ b/docs/source/step-by-step-tutorial.rst
@@ -7,7 +7,7 @@
 Here is the step by step tutorial of the entire pipeline. This is helpful in case you have your own files or just want to run a specific step.
 
 Before running anything make sure you have activated the conda environment using
-``conda activate autometa``.
+``mamba activate autometa``.
 
 See the :ref:`Autometa Package Installation` page for details on setting up your conda environment.
 
diff --git a/tests/environment.yml b/tests/environment.yml
index f140f6d70..13c29b39a 100644
--- a/tests/environment.yml
+++ b/tests/environment.yml
@@ -11,9 +11,7 @@ dependencies:
   - bowtie2
   - diamond>=2.0
   - gdown
-  - hdbscan
   - hmmer
-  - joblib==1.1.0 # See https://stackoverflow.com/a/73830525/12671809
   - numba>=0.47
   - numpy>=1.13
   - pandas>=1.1
@@ -30,8 +28,8 @@ dependencies:
   - rsync
   - samtools>=1.11
   - scikit-bio
-  - scipy==1.8.1 #force scipy 1.8 until scikit-bio updates to 1.9, https://github.com/KwanLab/Autometa/issues/285
-  - scikit-learn==0.24 # prevent error from joblib in multiprocessing distance calculations
+  - scipy
+  - scikit-learn>=1.3
   - sphinx
   - sphinx_rtd_theme
   - tqdm

From c8f142c81519d5743470c352f75cf76c146bcbf2 Mon Sep 17 00:00:00 2001
From: shaneroesemann <59748289+shaneroesemann@users.noreply.github.com>
Date: Thu, 24 Aug 2023 15:15:56 -0500
Subject: [PATCH 07/11] :bug::shell: Fix GTDB taxon-binning workflow (#339)

Append underscore to contig id to prevent partial matches

See also: https://github.com/KwanLab/Autometa/pull/329#issuecomment-1594996732
---
 workflows/autometa-large-data-mode.sh | 6 +++---
 workflows/autometa.sh                 | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/workflows/autometa-large-data-mode.sh b/workflows/autometa-large-data-mode.sh
index a4d65d36a..46cd58117 100644
--- a/workflows/autometa-large-data-mode.sh
+++ b/workflows/autometa-large-data-mode.sh
@@ -241,10 +241,10 @@ then
         set -x
         grep ">" $kingdom_fasta | \
             sed 's/^>//' | \
-            sed 's/$/_/' | \
-            cut -f1 -d" " > $orf_prefixes
+            cut -f1 -d" " | \
+            sed 's/$/_/' > $orf_prefixes
         # Retrieve ORF IDs from contig IDs
-        grep -f $orf_prefixes $orfs | sed 's/^>//' | cut -f1 -d" " > $orf_ids
+        grep -f $orf_prefixes $orfs | cut -f1 -d" " | sed 's/^>//' > $orf_ids
         # Retrieve ORF seqs from ORF IDs
         seqkit grep \
             --pattern-file $orf_ids \
diff --git a/workflows/autometa.sh b/workflows/autometa.sh
index fda8862b1..deb1ad606 100644
--- a/workflows/autometa.sh
+++ b/workflows/autometa.sh
@@ -231,10 +231,10 @@ then
         set -x
         grep ">" $kingdom_fasta | \
             sed 's/^>//' | \
-            sed 's/$/_/' | \
-            cut -f1 -d" " > $orf_prefixes
+            cut -f1 -d" " | \
+            sed 's/$/_/' > $orf_prefixes
         # Retrieve ORF IDs from contig IDs
-        grep -f $orf_prefixes $orfs | sed 's/^>//' | cut -f1 -d" " > $orf_ids
+        grep -f $orf_prefixes $orfs | cut -f1 -d" " | sed 's/^>//'  > $orf_ids
         # Retrieve ORF seqs from ORF IDs
         seqkit grep \
             --pattern-file $orf_ids \

From 5239018a87fb852fed68b7f5ddf568269c4cf910 Mon Sep 17 00:00:00 2001
From: Evan Rees <25933122+WiscEvan@users.noreply.github.com>
Date: Thu, 24 Aug 2023 16:45:00 -0400
Subject: [PATCH 08/11] Update documentation (#342)

* :memo: Prefix step-by-step tutorial title with 'bash'
* :memo: Rename step-by-step-tutorial.rst to bash-step-by-step-tutorial.rst
---
 docs/source/autometa-python-api.rst                 |  2 +-
 ...-tutorial.rst => bash-step-by-step-tutorial.rst} | 13 ++++++++-----
 docs/source/index.rst                               |  2 +-
 3 files changed, 10 insertions(+), 7 deletions(-)
 rename docs/source/{step-by-step-tutorial.rst => bash-step-by-step-tutorial.rst} (99%)

diff --git a/docs/source/autometa-python-api.rst b/docs/source/autometa-python-api.rst
index 389808d76..80db23b42 100644
--- a/docs/source/autometa-python-api.rst
+++ b/docs/source/autometa-python-api.rst
@@ -16,7 +16,7 @@ I.e. ``python -m autometa.common.kmers -h``
     Autometa has many *entrypoints* available that are utilized by the :ref:`autometa-nextflow-workflow` and :ref:`autometa-bash-workflow`. If you have installed autometa,
     all of these entrypoints will be available to you.
 
-    If you would like to get a better understanding of each entrypoint, we recommend reading the :ref:`step-by-step-tutorial` section.
+    If you would like to get a better understanding of each entrypoint, we recommend reading the :ref:`bash-step-by-step-tutorial` section.
 
 Using Autometa's Python API
 ###########################
diff --git a/docs/source/step-by-step-tutorial.rst b/docs/source/bash-step-by-step-tutorial.rst
similarity index 99%
rename from docs/source/step-by-step-tutorial.rst
rename to docs/source/bash-step-by-step-tutorial.rst
index 1a2dc76e9..c2ed60999 100644
--- a/docs/source/step-by-step-tutorial.rst
+++ b/docs/source/bash-step-by-step-tutorial.rst
@@ -1,10 +1,13 @@
-.. _step-by-step-tutorial:
+.. _bash-step-by-step-tutorial:
 
-===========================
-📓 Step by Step Tutorial 📓
-===========================
+================================
+📓 Bash Step by Step Tutorial 📓
+================================
 
-Here is the step by step tutorial of the entire pipeline. This is helpful in case you have your own files or just want to run a specific step.
+Here is the step by step tutorial on running the entire pipeline manually through Bash.
+This is helpful in case you have your own files or just want to run a specific step.
+
+If you would like to set up a run of the whole pipeline through Bash, see the :ref:`Bash Workflow <autometa-bash-workflow>` section.
 
 Before running anything make sure you have activated the conda environment using
 ``mamba activate autometa``.
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 583fa2774..288a054b6 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -17,7 +17,7 @@ Guide
    getting-started
    nextflow-workflow
    bash-workflow
-   step-by-step-tutorial
+   bash-step-by-step-tutorial
    databases
    examining-results
    benchmarking

From 98ae213ebd7ba81f5d5ea96a127c06be2bf7a968 Mon Sep 17 00:00:00 2001
From: Evan Rees <25933122+WiscEvan@users.noreply.github.com>
Date: Thu, 24 Aug 2023 17:36:14 -0400
Subject: [PATCH 09/11] :snake::art::fire::bug: Fix UnboundLocalError bug
 (#325)

* :snake::art::fire::bug: Remove unnecessary nesting
* Fixes #324

UnboundLocalError resulted from trying to update `binning_checkpoints` dataframe when it was actually not available (occurs when `--cache` is *not* provided). Now variable is initialized accordingly to remove this error.
---
 autometa/binning/large_data_mode.py | 52 ++++++++++++++++-------------
 1 file changed, 28 insertions(+), 24 deletions(-)

diff --git a/autometa/binning/large_data_mode.py b/autometa/binning/large_data_mode.py
index d6dc34b3b..cac25d292 100644
--- a/autometa/binning/large_data_mode.py
+++ b/autometa/binning/large_data_mode.py
@@ -344,30 +344,34 @@ def cluster_by_taxon_partitioning(
             binning_checkpoints_fpath = os.path.join(
                 cache, "binning_checkpoints.tsv.gz"
             )
-    if binning_checkpoints_fpath:
-        if os.path.exists(binning_checkpoints_fpath) and os.path.getsize(binning_checkpoints_fpath):
-            checkpoint_info = get_checkpoint_info(binning_checkpoints_fpath)
-            binning_checkpoints = checkpoint_info["binning_checkpoints"]
-            starting_rank = checkpoint_info["starting_rank"]
-            starting_rank_name_txt = checkpoint_info["starting_rank_name_txt"]
-            # Update datastructures to begin at checkpoint stage.
-            ## Forward fill binning annotations to most recent checkpoint and drop any contigs without bin annotations
-            most_recent_binning_checkpoint = (
-                binning_checkpoints.fillna(axis=1, method="ffill").iloc[:, -1].dropna()
-            )
-            clustered_contigs = set(
-                most_recent_binning_checkpoint.index.unique().tolist()
-            )
-            most_recent_clustered_df = most_recent_binning_checkpoint.to_frame().rename(
-                columns={starting_rank_name_txt: "cluster"}
-            )
-            num_clusters = most_recent_clustered_df.cluster.nunique()
-            clusters.append(most_recent_clustered_df)
-        else:
-            logger.debug(
-                f"Binning checkpoints not found. Writing checkpoints to {binning_checkpoints_fpath}"
-            )
-            binning_checkpoints = pd.DataFrame()
+    if (
+        binning_checkpoints_fpath
+        and os.path.exists(binning_checkpoints_fpath)
+        and os.path.getsize(binning_checkpoints_fpath)
+    ):
+        checkpoint_info = get_checkpoint_info(binning_checkpoints_fpath)
+        binning_checkpoints = checkpoint_info["binning_checkpoints"]
+        starting_rank = checkpoint_info["starting_rank"]
+        starting_rank_name_txt = checkpoint_info["starting_rank_name_txt"]
+        # Update datastructures to begin at checkpoint stage.
+        ## Forward fill binning annotations to most recent checkpoint and drop any contigs without bin annotations
+        most_recent_binning_checkpoint = (
+            binning_checkpoints.fillna(axis=1, method="ffill").iloc[:, -1].dropna()
+        )
+        clustered_contigs = set(most_recent_binning_checkpoint.index.unique().tolist())
+        most_recent_clustered_df = most_recent_binning_checkpoint.to_frame().rename(
+            columns={starting_rank_name_txt: "cluster"}
+        )
+        num_clusters = most_recent_clustered_df.cluster.nunique()
+        clusters.append(most_recent_clustered_df)
+    else:
+        logger_message = (
+            f"Binning checkpoints not found. Writing checkpoints to {binning_checkpoints_fpath}"
+            if binning_checkpoints_fpath
+            else "Binning checkpoints not found. Initializing..."
+        )
+        logger.debug(logger_message)
+        binning_checkpoints = pd.DataFrame()
 
     # Subset ranks by provided (or checkpointed) starting rank
     starting_rank_index = canonical_ranks.index(starting_rank)

From bf5272988eb52a37566d75760f03ea22e8884de9 Mon Sep 17 00:00:00 2001
From: Evan Rees <25933122+WiscEvan@users.noreply.github.com>
Date: Thu, 24 Aug 2023 17:41:19 -0400
Subject: [PATCH 10/11] :bug::art::snake: Fixes #305 (#343)

:art: Add UNCLASSIFIED attribute to TaxonomyDatabase class
:art::bug::snake: Add logic to include blastp unaligned contigs to unclassified fasta

----------

Co-authored-by: WiscEvan <erees@wisc.edu>
Co-authored-by: kaw97 <kawolf2@wisc.edu>
---
 autometa/taxonomy/database.py | 26 +++++++++++++++-----------
 autometa/taxonomy/vote.py     | 13 ++++++++++++-
 2 files changed, 27 insertions(+), 12 deletions(-)

diff --git a/autometa/taxonomy/database.py b/autometa/taxonomy/database.py
index b3d1a1b56..b53b21177 100644
--- a/autometa/taxonomy/database.py
+++ b/autometa/taxonomy/database.py
@@ -31,7 +31,7 @@ class TaxonomyDatabase(ABC):
     class GTDB(TaxonomyDatabase):
         def __init__(self, ...):
             self.nodes = self.parse_nodes()
-            self.names = self.parse_names()            
+            self.names = self.parse_names()
             self.merged = self.parse_merged()
             self.delnodes = self.parse_delnodes()
             ...
@@ -59,6 +59,7 @@ def convert_accessions_to_taxids(self, accessions):
     Available attributes:
 
     CANONICAL_RANKS
+    UNCLASSIFIED
     """
 
     CANONICAL_RANKS = [
@@ -71,6 +72,7 @@ def convert_accessions_to_taxids(self, accessions):
         "superkingdom",
         "root",
     ]
+    UNCLASSIFIED = "unclassified"
 
     @abstractmethod
     def parse_nodes(self) -> Dict[int, Dict[str, Union[str, int]]]:
@@ -100,7 +102,7 @@ def parse_names(self) -> Dict[int, str]:
         Returns
         -------
         str
-            Name of provided `taxid` if `taxid` is found in names.dmp else 'unclassified'
+            Name of provided `taxid` if `taxid` is found in names.dmp else TaxonomyDatabase.UNCLASSIFIED
 
         """
 
@@ -237,7 +239,7 @@ def name(self, taxid: int, rank: str = None) -> str:
         Returns
         -------
         str
-            Name of provided `taxid` if `taxid` is found in names.dmp else 'unclassified'
+            Name of provided `taxid` if `taxid` is found in names.dmp else TaxonomyDatabase.UNCLASSIFIED
 
         """
         try:
@@ -246,19 +248,19 @@ def name(self, taxid: int, rank: str = None) -> str:
             logger.warning(err)
             taxid = 0
         if not rank:
-            return self.names.get(taxid, "unclassified")
+            return self.names.get(taxid, TaxonomyDatabase.UNCLASSIFIED)
         if rank not in set(TaxonomyDatabase.CANONICAL_RANKS):
             logger.warning(f"{rank} not in canonical ranks!")
-            return "unclassified"
+            return TaxonomyDatabase.UNCLASSIFIED
         ancestor_taxid = taxid
         while ancestor_taxid != 1:
             ancestor_rank = self.rank(ancestor_taxid)
             if ancestor_rank == rank:
-                return self.names.get(ancestor_taxid, "unclassified")
+                return self.names.get(ancestor_taxid, TaxonomyDatabase.UNCLASSIFIED)
             ancestor_taxid = self.parent(ancestor_taxid)
         # At this point we have not encountered a name for the taxid rank
         # so we will place this as unclassified.
-        return "unclassified"
+        return TaxonomyDatabase.UNCLASSIFIED
 
     def rank(self, taxid: int) -> str:
         """
@@ -272,7 +274,7 @@ def rank(self, taxid: int) -> str:
         Returns
         -------
         str
-            rank name if taxid is found in nodes else "unclassified"
+            rank name if taxid is found in nodes else TaxonomyDatabase.UNCLASSIFIED
 
         """
         try:
@@ -280,7 +282,9 @@ def rank(self, taxid: int) -> str:
         except DatabaseOutOfSyncError as err:
             logger.warning(err)
             taxid = 0
-        return self.nodes.get(taxid, {"rank": "unclassified"}).get("rank")
+        return self.nodes.get(taxid, {"rank": TaxonomyDatabase.UNCLASSIFIED}).get(
+            "rank"
+        )
 
     def parent(self, taxid: int) -> int:
         """
@@ -368,7 +372,7 @@ def get_lineage_dataframe(
         taxids : iterable
             `taxids` whose lineage dataframe is being returned
         fillna : bool, optional
-            Whether to fill the empty cells  with 'unclassified' or not, default True
+            Whether to fill the empty cells with TaxonomyDatabase.UNCLASSIFIED or not, default True
 
         Returns
         -------
@@ -408,5 +412,5 @@ def get_lineage_dataframe(
         df = pd.DataFrame(ranked_taxids).transpose()
         df.index.name = "taxid"
         if fillna:
-            df.fillna(value="unclassified", inplace=True)
+            df.fillna(value=TaxonomyDatabase.UNCLASSIFIED, inplace=True)
         return df
diff --git a/autometa/taxonomy/vote.py b/autometa/taxonomy/vote.py
index aa6211f19..c6eeae2f2 100644
--- a/autometa/taxonomy/vote.py
+++ b/autometa/taxonomy/vote.py
@@ -272,6 +272,12 @@ def write_ranks(
     if not os.path.exists(outdir):
         os.makedirs(outdir)
     assembly_records = [record for record in SeqIO.parse(assembly, "fasta")]
+    # Include unaligned records in unclassified fasta
+    unaligned_contigs = set(
+        record.id
+        for record in SeqIO.parse(assembly, "fasta")
+        if record.id not in taxonomy.index
+    )
     fpaths = []
     for rank_name, dff in taxonomy.groupby(rank):
         # First determine the file path respective to the rank name
@@ -282,7 +288,12 @@ def write_ranks(
             rank_name_fname = ".".join([rank_name.lower(), "fna"])
         rank_name_fpath = os.path.join(outdir, rank_name_fname)
         # Now retrieve and write records respective to rank
-        records = [record for record in assembly_records if record.id in dff.index]
+        # include unaligned contigs if rank is unclassified
+        if rank_name == TaxonomyDatabase.UNCLASSIFIED:
+            contig_set = set(dff.index).union(unaligned_contigs)
+        else:
+            contig_set = dff.index
+        records = [record for record in assembly_records if record.id in contig_set]
         if not records:
             logger.warning(f"No records to write to {rank_name_fpath}")
         else:

From 2363789fc2547bb4bd9555d867d19e8969f40d81 Mon Sep 17 00:00:00 2001
From: WiscEvan <erees@wisc.edu>
Date: Thu, 24 Aug 2023 16:45:52 -0500
Subject: [PATCH 11/11] bump version to 2.2.1

---
 VERSION         | 2 +-
 nextflow.config | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/VERSION b/VERSION
index ccbccc3dc..c043eea77 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-2.2.0
+2.2.1
diff --git a/nextflow.config b/nextflow.config
index f36dedc27..369fc01f1 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -12,7 +12,7 @@ manifest {
     doi = "https://doi.org/10.1093/nar/gkz148"
     mainScript = "main.nf"
     nextflowVersion = ">=21.04.0"
-    version = "2.2.0"
+    version = "2.2.1"
 }